comparison gene_family_scaffold_loader.py @ 3:f56e20e010e2 draft default tip

Uploaded
author greg
date Fri, 19 Oct 2018 09:49:23 -0400
parents cb101ec1a0dd
children
comparison
equal deleted inserted replaced
2:cb101ec1a0dd 3:f56e20e010e2
8 import glob 8 import glob
9 import os 9 import os
10 import sys 10 import sys
11 11
12 import psycopg2 12 import psycopg2
13 from sqlalchemy import create_engine, MetaData, Table
13 from sqlalchemy.engine.url import make_url 14 from sqlalchemy.engine.url import make_url
15
16 BLACKLIST_STRINGS = ['NULL',
17 'Unknown protein',
18 'No TAIR description',
19 'Representative annotation below 0',
20 'Representative AHRD below 0']
14 21
15 22
16 class ScaffoldLoader(object): 23 class ScaffoldLoader(object):
17 def __init__(self): 24 def __init__(self):
18 self.args = None 25 self.args = None
25 self.species_ids_dict = {} 32 self.species_ids_dict = {}
26 self.taxa_lineage_config = None 33 self.taxa_lineage_config = None
27 self.parse_args() 34 self.parse_args()
28 self.fh = open(self.args.output, "w") 35 self.fh = open(self.args.output, "w")
29 self.connect_db() 36 self.connect_db()
37 self.engine = create_engine(self.args.database_connection_string)
38 self.metadata = MetaData(self.engine)
30 39
31 def parse_args(self): 40 def parse_args(self):
32 parser = argparse.ArgumentParser() 41 parser = argparse.ArgumentParser()
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), 42 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
34 parser.add_argument('--output', dest='output', help='Output dataset'), 43 parser.add_argument('--output', dest='output', help='Output dataset'),
101 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate 110 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
102 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. 111 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
103 1. Parse all of the *.list files in the same directory to populate 112 1. Parse all of the *.list files in the same directory to populate
104 self.scaffold_genes_dict. 113 self.scaffold_genes_dict.
105 """ 114 """
115 self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True)
106 scaffold_id = os.path.basename(self.args.scaffold_path) 116 scaffold_id = os.path.basename(self.args.scaffold_path)
107 file_dir = os.path.join(self.args.scaffold_path, 'annot') 117 file_dir = os.path.join(self.args.scaffold_path, 'annot')
108 # The scaffold naming convention must follow this pattern: 118 # The scaffold naming convention must follow this pattern:
109 # <integer1>Gv<integer2>.<integer3> 119 # <integer1>Gv<integer2>.<integer3>
110 # where integer 1 is the number of genomes in the scaffold_id. For example: 120 # where integer 1 is the number of genomes in the scaffold_id. For example:
149 j_int = int(items[j]) 159 j_int = int(items[j])
150 if j_int > 0: 160 if j_int > 0:
151 # The species has at least 1 gene 161 # The species has at least 1 gene
152 num_species += 1 162 num_species += 1
153 num_genes += j_int 163 num_genes += j_int
154 # Insert a row into the plant_tribes_orthogroup table. 164 # Get the auto-incremented row id to insert a row into
155 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] 165 # the plant_tribes_orthogroup table.
156 for k in range(super_ortho_start_index, len(items)): 166 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');"
157 args.append('%s' % str(items[k])) 167 cur = self.conn.cursor()
158 sql = """ 168 cur.execute(sql)
159 INSERT INTO plant_tribes_orthogroup 169 plant_tribes_orthogroup_id = cur.fetchone()[0]
160 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); 170 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
161 """ 171 last_item = len(items)
162 cur = self.update(sql, tuple(args)) 172 for k in range(super_ortho_start_index, last_item):
163 self.flush() 173 bs_found = False
174 # The last 7 items in this range are as follows.
175 # items[last_item-7]: AHRD Descriptions
176 # items[last_item-6]: TAIR Gene(s) Descriptions
177 # items[last_item-5]: Pfam Domains
178 # items[last_item-4]: InterProScan Descriptions
179 # items[last_item-3]: GO Molecular Functions
180 # items[last_item-2]: GO Biological Processes
181 # items[last_item-1]: GO Cellular Components
182 # We'll translate each of these items into a JSON
183 # dictionary for inserting into the table.
184 if k >= (last_item-7) and k <= last_item:
185 json_str = str(items[k])
186 # Here is an example string:
187 # Phosphate transporter PHO1 [0.327] | Phosphate
188 for bs in BLACKLIST_STRINGS:
189 if json_str.find(bs) >= 0:
190 bs_found = True
191 args.append(None)
192 break
193 if not bs_found:
194 # We'll split the string on " | " to create each value.
195 # The keys will be zero-padded integers to enable sorting.
196 json_dict = dict()
197 json_vals = json_str.split(' | ')
198 for key_index, json_val in enumerate(json_vals):
199 # The zero-padded key is 0 based (enumerate starts at 0).
200 json_key = '%04d' % key_index
201 json_dict[json_key] = json_val
202 args.append(json_dict)
203 else:
204 args.append('%s' % str(items[k]))
205 sql = self.pto_table.insert().values(args)
206 try:
207 self.engine.execute(sql)
208 except Exception as e:
209 msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e)
210 self.stop_err(msg)
164 i += 1 211 i += 1
165 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) 212 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
166 for file_name in glob.glob(os.path.join(file_dir, "*list")): 213 for file_name in glob.glob(os.path.join(file_dir, "*list")):
167 items = os.path.basename(file_name).split(".") 214 items = os.path.basename(file_name).split(".")
168 clustering_method = items[0] 215 clustering_method = items[0]