comparison gene_family_scaffold_loader.py @ 0:6282484a52bc draft

Uploaded
author greg
date Tue, 21 Aug 2018 12:57:00 -0400
parents
children cb101ec1a0dd
comparison
equal deleted inserted replaced
-1:000000000000 0:6282484a52bc
1 #!/usr/bin/env python
2 """
3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes
4 database efficiently by bypassing the Galaxy model and operating directly on the database.
5 PostgreSQL 9.1 or greater is required.
6 """
7 import argparse
8 import glob
9 import os
10 import sys
11
12 import psycopg2
13 from sqlalchemy.engine.url import make_url
14
15
16 class ScaffoldLoader(object):
17 def __init__(self):
18 self.args = None
19 self.clustering_methods = []
20 self.conn = None
21 self.gene_sequences_dict = {}
22 self.scaffold_genes_dict = {}
23 self.scaffold_recs = []
24 self.species_genes_dict = {}
25 self.species_ids_dict = {}
26 self.taxa_lineage_config = None
27 self.parse_args()
28 self.fh = open(self.args.output, "w")
29 self.connect_db()
30
31 def parse_args(self):
32 parser = argparse.ArgumentParser()
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
34 parser.add_argument('--output', dest='output', help='Output dataset'),
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
36 self.args = parser.parse_args()
37
38 def connect_db(self):
39 url = make_url(self.args.database_connection_string)
40 self.log('Connecting to database with URL: %s' % url)
41 args = url.translate_connect_args(username='user')
42 args.update(url.query)
43 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
44 self.conn = psycopg2.connect(**args)
45
46 def flush(self):
47 self.conn.commit()
48
49 def shutdown(self):
50 self.conn.close()
51
52 def update(self, sql, args):
53 try:
54 cur = self.conn.cursor()
55 cur.execute(sql, args)
56 except Exception as e:
57 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e)
58 self.stop_err(msg)
59 return cur
60
61 def stop_err(self, msg):
62 sys.stderr.write(msg)
63 self.fh.flush()
64 self.fh.close()
65 sys.exit(1)
66
67 def log(self, msg):
68 self.fh.write("%s\n" % msg)
69 self.fh.flush()
70
71 @property
72 def can_add_scaffold(self):
73 """
74 Make sure the scaffold has not already been added.
75 """
76 scaffold_id = os.path.basename(self.args.scaffold_path)
77 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id
78 cur = self.conn.cursor()
79 cur.execute(sql)
80 try:
81 cur.fetchone()[0]
82 # The scaffold has been added to the database.
83 return False
84 except:
85 # The scaffold has not yet been added.
86 return True
87
88 def run(self):
89 if self.can_add_scaffold:
90 self.process_annot_dir()
91 self.process_scaffold_config_files()
92 self.process_orthogroup_fasta_files()
93 self.fh.flush()
94 self.fh.close()
95 else:
96 self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path))
97
98 def process_annot_dir(self):
99 """
100 1. Parse all of the *.min_evalue.summary files in the
101 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
102 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
103 1. Parse all of the *.list files in the same directory to populate
104 self.scaffold_genes_dict.
105 """
106 scaffold_id = os.path.basename(self.args.scaffold_path)
107 file_dir = os.path.join(self.args.scaffold_path, 'annot')
108 # The scaffol naming convention must follow this pattern:
109 # <integer1>Gv<integer2>.<integer3>
110 # where integer 1 is the number of genomes in the scaffold_id. For example:
111 # 22Gv1.1 -> 22 genomes
112 # 12Gv1.0 -> 12 genomes
113 # 26Gv2.0 -> 26 genomes, etc.
114 num_genomes = int(scaffold_id.split("Gv")[0])
115 super_ortho_start_index = num_genomes + 1
116 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")):
117 items = os.path.basename(file_name).split(".")
118 clustering_method = items[0]
119 # Save all clustering methods for later processing.
120 if clustering_method not in self.clustering_methods:
121 self.clustering_methods.append(clustering_method)
122 # Insert a row in to the plant_tribes_scaffold table.
123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
124 args = [scaffold_id, clustering_method]
125 sql = """
126 INSERT INTO plant_tribes_scaffold
127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
128 RETURNING id;
129 """
130 cur = self.update(sql, tuple(args))
131 self.flush()
132 scaffold_id_db = cur.fetchone()[0]
133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
134 with open(file_name, "r") as fh:
135 i = 0
136 for i2, line in enumerate(fh):
137 if i2 == 0:
138 # Skip first line.
139 continue
140 num_genes = 0
141 num_species = 0
142 items = line.split("\t")
143 orthogroup_id = int(items[0])
144 # Zero based items 1 to num_genomes consists of the
145 # number of species classified in the orthogroup (i.e.,
146 # species with at least 1 gene in the orthogroup).
147 for j in range(1, num_genomes):
148 j_int = int(items[j])
149 if j_int > 0:
150 # The species has at least 1 gene
151 num_species += 1
152 num_genes += j_int
153 # Insert a row into the plant_tribes_orthogroup table.
154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
155 for k in range(super_ortho_start_index, len(items)):
156 args.append('%s' % str(items[k]))
157 sql = """
158 INSERT INTO plant_tribes_orthogroup
159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
160 """
161 cur = self.update(sql, tuple(args))
162 self.flush()
163 i += 1
164 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
165 for file_name in glob.glob(os.path.join(file_dir, "*list")):
166 items = os.path.basename(file_name).split(".")
167 clustering_method = items[0]
168 with open(file_name, "r") as fh:
169 for i, line in enumerate(fh):
170 items = line.split("\t")
171 # The key will be a combination of clustering_method and
172 # orthogroup_id separated by "^^" for easy splitting later.
173 key = "%s^^%s" % (clustering_method, items[0])
174 # The value is the gen_id with all white space replaced by "_".
175 val = items[1].replace("|", "_")
176 if key in self.scaffold_genes_dict:
177 self.scaffold_genes_dict[key].append(val)
178 else:
179 self.scaffold_genes_dict[key] = [val]
180
181 def process_scaffold_config_files(self):
182 """
183 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config
184 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate.
185 2. Calculate the number of genes found
186 for each species and add the number to self.species_genes_dict.
187 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
188 populate the plant_tribes_taxon table.
189 """
190 scaffold_id = os.path.basename(self.args.scaffold_path)
191 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
192 self.log("Processing rooting order config: %s" % str(file_name))
193 # Populate self.species_ids_dict.
194 with open(file_name, "r") as fh:
195 for i, line in enumerate(fh):
196 line = line.strip()
197 if len(line) == 0 or line.startswith("#") or line.startswith("["):
198 # Skip blank lines, comments and section headers.
199 continue
200 # Example line:
201 # Physcomitrella patens=Phypa
202 items = line.split("=")
203 self.species_ids_dict[items[1]] = items[0]
204 # Get lineage information for orthogrpoup taxa.
205 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()):
206 # The format of key is <clustering_method>^^<orthogroup_id>.
207 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001"
208 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^")
209 clustering_method = scaffold_genes_dict_key_items[0]
210 # Get the list of genes for the current scaffold_genes_dict_key.
211 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key]
212 for gene_id in gene_list:
213 # Example species_code: Musac1.0, where
214 # species_name is Musac and version is 1.0.
215 species_code = gene_id.split("_")[1]
216 # Strip the version from the species_code.
217 species_code = species_code[0:5]
218 # Get the species_name from self.species_ids_dict.
219 species_name = self.species_ids_dict[species_code]
220 # Create a key for self.species_genes_dict, with the format:
221 # <clustering_method>^^<species_name>
222 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
223 # Add an entry to self.species_genes_dict, where the value
224 # is a list containing species_name and num_genes.
225 if species_genes_dict_key in self.species_genes_dict:
226 tup = self.species_genes_dict[species_genes_dict_key]
227 tup[1] += 1
228 self.species_genes_dict[species_genes_dict_key] = tup
229 else:
230 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
231 # Populate the plant_tribes_taxon table.
232 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
233 self.log("Processing taxa lineage config: %s" % str(file_name))
234 with open(file_name, "r") as fh:
235 for line in fh:
236 line = line.strip()
237 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
238 # Skip blank lines, comments and section headers.
239 continue
240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
241 items = line.split("\t")
242 species_name = items[0]
243 i = 0
244 for clustering_method in self.clustering_methods:
245 # The format of species_genes_dict_key is <clustering_method>^^<species_name>.
246 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
247 # Get the scaffold_rec for the current scaffold_id and clustering_method.
248 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
249 for scaffold_rec in self.scaffold_recs:
250 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
251 scaffold_id_db = scaffold_rec[0]
252 # The value is a list containing species_name and num_genes.
253 val = self.species_genes_dict[species_genes_dict_key]
254 if species_name == val[0]:
255 num_genes = val[1]
256 else:
257 num_genes = 0
258 # Insert a row in to the plant_tribes_scaffold table.
259 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
260 sql = """
261 INSERT INTO plant_tribes_taxon
262 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
263 """
264 self.update(sql, tuple(args))
265 self.flush()
266 i += 1
267 self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name)))
268
269 def process_orthogroup_fasta_files(self):
270 """
271 1. Analyze all of the scaffold .fna and .faa files for each clustering
272 method to populate the aa_dict and dna_dict sequence dictionaries.
273 2. Use the populated sequence dictionaries to populate the plant_tribes_gene
274 and gene_scaffold_orthogroup_taxon_association tables.
275 """
276 scaffold_id = os.path.basename(self.args.scaffold_path)
277 aa_dict = {}
278 dna_dict = {}
279 # Populate aa_dict and dna_dict.
280 for clustering_method in self.clustering_methods:
281 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method)
282 for file_name in os.listdir(file_dir):
283 items = file_name.split(".")
284 orthogroup_id = items[0]
285 file_extension = items[1]
286 if file_extension == "fna":
287 adict = dna_dict
288 else:
289 adict = aa_dict
290 file_path = os.path.join(file_dir, file_name)
291 with open(file_path, "r") as fh:
292 for i, line in enumerate(fh):
293 line = line.strip()
294 if len(line) == 0:
295 # Skip blank lines (shoudn't happen).
296 continue
297 if line.startswith(">"):
298 # Example line:
299 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110
300 gene_id = line.lstrip(">")
301 # The dictionary keys will combine the orthogroup_id,
302 # clustering method and gene id using the format
303 # ,orthogroup_id>^^<clustering_method>^^<gene_id>.
304 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id)
305 if combined_id not in adict:
306 # The value will be the dna sequence string..
307 adict[combined_id] = ""
308 else:
309 # Example line:
310 # ATGGAGAAGGACTTT
311 # Here combined_id is set because the fasta format specifies
312 # that all lines following the gene id defined in the if block
313 # above will be the sequence associated with that gene until
314 # the next gene id line is encountered.
315 sequence = adict[combined_id]
316 sequence = "%s%s" % (sequence, line)
317 adict[combined_id] = sequence
318 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables
319 # from the contents of aa_dict and dna_dict.
320 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.")
321 gi = 0
322 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
323 # The dictionary keys combine the orthogroup_id, clustering method and
324 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
325 items = combined_id.split("^^")
326 orthogroup_id = items[0]
327 clustering_method = items[1]
328 gene_id = items[2]
329 # The value will be a list containing both
330 # clustering_method and the dna string.
331 dna_sequence = dna_dict[combined_id]
332 aa_sequence = aa_dict[combined_id]
333 # Get the species_code from the gene_id.
334 species_code = gene_id.split("_")[1]
335 # Strip the version from the species_code.
336 species_code = species_code[0:5]
337 # Get the species_name from self.species_ids_dict.
338 species_name = self.species_ids_dict[species_code]
339 # Get the plant_tribes_orthogroup primary key id for
340 # the orthogroup_id from the plant_tribes_orthogroup table.
341 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
342 cur = self.conn.cursor()
343 cur.execute(sql)
344 orthogroup_id_db = cur.fetchone()[0]
345 # If the plant_tribes_gene table contains a row that has the gene_id,
346 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
347 # Get the taxon_id for the species_name from the plant_tribes_taxon table.
348 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
349 cur = self.conn.cursor()
350 cur.execute(sql)
351 taxon_id_db = cur.fetchone()[0]
352 # If the plant_tribes_gene table contains a row that has the gene_id,
353 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
354 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
355 cur = self.conn.cursor()
356 cur.execute(sql)
357 try:
358 gene_id_db = cur.fetchone()[0]
359 except:
360 # Insert a row into the plant_tribes_gene table.
361 args = [gene_id, dna_sequence, aa_sequence]
362 sql = """
363 INSERT INTO plant_tribes_gene
364 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s)
365 RETURNING id;
366 """
367 cur = self.update(sql, tuple(args))
368 self.flush()
369 gene_id_db = cur.fetchone()[0]
370 gi += 1
371 if gi % 1000 == 0:
372 self.log("Inserted 1000 more rows into the plant_tribes_gene table.")
373 # Insert a row into the gene_scaffold_orthogroup_taxon_association table.
374 # Get the scaffold_rec for the current scaffold_id and clustering_method.
375 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
376 for scaffold_rec in self.scaffold_recs:
377 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
378 scaffold_id_db = scaffold_rec[0]
379 args = [gene_id_db, scaffold_id_db, orthogroup_id_db, taxon_id_db]
380 sql = """
381 INSERT INTO gene_scaffold_orthogroup_taxon_association
382 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s);
383 """
384 cur = self.update(sql, tuple(args))
385 self.flush()
386 if gsoai % 1000 == 0:
387 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.")
388 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
389 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai)
390
391
392 if __name__ == '__main__':
393 scaffold_loader = ScaffoldLoader()
394 scaffold_loader.run()
395 scaffold_loader.shutdown()