plant_tribes_gene_family_scaffold_loader: gene_family_scaffold

comparison gene_family_scaffold_loader.py @ 3:f56e20e010e2 draft default tip

Uploaded

author	greg
date	Fri, 19 Oct 2018 09:49:23 -0400
parents	cb101ec1a0dd
children

comparison

equal deleted inserted replaced

-:cb101ec1a0dd
+:f56e20e010e2
 import glob
 import os
 import sys
 import psycopg2
+from sqlalchemy import create_engine, MetaData, Table
 from sqlalchemy.engine.url import make_url
+BLACKLIST_STRINGS = ['NULL',
+'Unknown protein',
+'No TAIR description',
+'Representative annotation below 0',
+'Representative AHRD below 0']
 class ScaffoldLoader(object):
 def __init__(self):
 self.args = None
 self.species_ids_dict = {}
 self.taxa_lineage_config = None
 self.parse_args()
 self.fh = open(self.args.output, "w")
 self.connect_db()
+self.engine = create_engine(self.args.database_connection_string)
+self.metadata = MetaData(self.engine)
 def parse_args(self):
 parser = argparse.ArgumentParser()
 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
 parser.add_argument('--output', dest='output', help='Output dataset'),
 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
 1. Parse all of the *.list files in the same directory to populate
 self.scaffold_genes_dict.
 """
+self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True)
 scaffold_id = os.path.basename(self.args.scaffold_path)
 file_dir = os.path.join(self.args.scaffold_path, 'annot')
 # The scaffold naming convention must follow this pattern:
 # <integer1>Gv<integer2>.<integer3>
 # where integer 1 is the number of genomes in the scaffold_id.  For example:
 j_int = int(items[j])
 if j_int > 0:
 # The  species has at least 1 gene
 num_species += 1
 num_genes += j_int
-# Insert a row into the plant_tribes_orthogroup table.
+# Get the auto-incremented row id to insert a row into
-args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
+# the plant_tribes_orthogroup table.
-for k in range(super_ortho_start_index, len(items)):
+sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');"
-args.append('%s' % str(items[k]))
+cur = self.conn.cursor()
-sql = """
+cur.execute(sql)
-INSERT INTO plant_tribes_orthogroup
+plant_tribes_orthogroup_id = cur.fetchone()[0]
-VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
-"""
+last_item = len(items)
-cur = self.update(sql, tuple(args))
+for k in range(super_ortho_start_index, last_item):
-self.flush()
+bs_found = False
+# The last 7 items in this range are as follows.
+# items[last_item-7]: AHRD Descriptions
+# items[last_item-6]: TAIR Gene(s) Descriptions
+# items[last_item-5]: Pfam Domains
+# items[last_item-4]: InterProScan Descriptions
+# items[last_item-3]: GO Molecular Functions
+# items[last_item-2]: GO Biological Processes
+# items[last_item-1]: GO Cellular Components
+# We'll translate each of these items into a JSON
+# dictionary for inserting into the table.
+if k >= (last_item-7) and k <= last_item:
+json_str = str(items[k])
+# Here is an example string:
+# Phosphate transporter PHO1 [0.327] | Phosphate
+for bs in BLACKLIST_STRINGS:
+if json_str.find(bs) >= 0:
+bs_found = True
+args.append(None)
+break
+if not bs_found:
+# We'll split the string on " | " to create each value.
+# The keys will be zero-padded integers to enable sorting.
+json_dict = dict()
+json_vals = json_str.split(' | ')
+for key_index, json_val in enumerate(json_vals):
+# The zero-padded key is 0 based (enumerate starts at 0).
+json_key = '%04d' % key_index
+json_dict[json_key] = json_val
+args.append(json_dict)
+else:
+args.append('%s' % str(items[k]))
+sql = self.pto_table.insert().values(args)
+try:
+self.engine.execute(sql)
+except Exception as e:
+msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e)
+self.stop_err(msg)
 i += 1
 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
 for file_name in glob.glob(os.path.join(file_dir, "*list")):
 items = os.path.basename(file_name).split(".")
 clustering_method = items[0]

Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader

comparison gene_family_scaffold_loader.py @ 3:f56e20e010e2 draft default tip