Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
comparison gene_family_scaffold_loader.py @ 3:f56e20e010e2 draft default tip
Uploaded
author | greg |
---|---|
date | Fri, 19 Oct 2018 09:49:23 -0400 |
parents | cb101ec1a0dd |
children |
comparison
equal
deleted
inserted
replaced
2:cb101ec1a0dd | 3:f56e20e010e2 |
---|---|
8 import glob | 8 import glob |
9 import os | 9 import os |
10 import sys | 10 import sys |
11 | 11 |
12 import psycopg2 | 12 import psycopg2 |
13 from sqlalchemy import create_engine, MetaData, Table | |
13 from sqlalchemy.engine.url import make_url | 14 from sqlalchemy.engine.url import make_url |
15 | |
16 BLACKLIST_STRINGS = ['NULL', | |
17 'Unknown protein', | |
18 'No TAIR description', | |
19 'Representative annotation below 0' | |
20 'Representative AHRD below 0'] | |
14 | 21 |
15 | 22 |
16 class ScaffoldLoader(object): | 23 class ScaffoldLoader(object): |
17 def __init__(self): | 24 def __init__(self): |
18 self.args = None | 25 self.args = None |
25 self.species_ids_dict = {} | 32 self.species_ids_dict = {} |
26 self.taxa_lineage_config = None | 33 self.taxa_lineage_config = None |
27 self.parse_args() | 34 self.parse_args() |
28 self.fh = open(self.args.output, "w") | 35 self.fh = open(self.args.output, "w") |
29 self.connect_db() | 36 self.connect_db() |
37 self.engine = create_engine(self.args.database_connection_string) | |
38 self.metadata = MetaData(self.engine) | |
30 | 39 |
31 def parse_args(self): | 40 def parse_args(self): |
32 parser = argparse.ArgumentParser() | 41 parser = argparse.ArgumentParser() |
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | 42 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), |
34 parser.add_argument('--output', dest='output', help='Output dataset'), | 43 parser.add_argument('--output', dest='output', help='Output dataset'), |
101 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate | 110 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate |
102 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. | 111 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. |
103 1. Parse all of the *.list files in the same directory to populate | 112 1. Parse all of the *.list files in the same directory to populate |
104 self.scaffold_genes_dict. | 113 self.scaffold_genes_dict. |
105 """ | 114 """ |
115 self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True) | |
106 scaffold_id = os.path.basename(self.args.scaffold_path) | 116 scaffold_id = os.path.basename(self.args.scaffold_path) |
107 file_dir = os.path.join(self.args.scaffold_path, 'annot') | 117 file_dir = os.path.join(self.args.scaffold_path, 'annot') |
108 # The scaffold naming convention must follow this pattern: | 118 # The scaffold naming convention must follow this pattern: |
109 # <integer1>Gv<integer2>.<integer3> | 119 # <integer1>Gv<integer2>.<integer3> |
110 # where integer 1 is the number of genomes in the scaffold_id. For example: | 120 # where integer 1 is the number of genomes in the scaffold_id. For example: |
149 j_int = int(items[j]) | 159 j_int = int(items[j]) |
150 if j_int > 0: | 160 if j_int > 0: |
151 # The species has at least 1 gene | 161 # The species has at least 1 gene |
152 num_species += 1 | 162 num_species += 1 |
153 num_genes += j_int | 163 num_genes += j_int |
154 # Insert a row into the plant_tribes_orthogroup table. | 164 # Get the auto-incremented row id to insert a row into |
155 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] | 165 # the plant_tribes_orthogroup table. |
156 for k in range(super_ortho_start_index, len(items)): | 166 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');" |
157 args.append('%s' % str(items[k])) | 167 cur = self.conn.cursor() |
158 sql = """ | 168 cur.execute(sql) |
159 INSERT INTO plant_tribes_orthogroup | 169 plant_tribes_orthogroup_id = cur.fetchone()[0] |
160 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); | 170 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] |
161 """ | 171 last_item = len(items) |
162 cur = self.update(sql, tuple(args)) | 172 for k in range(super_ortho_start_index, last_item): |
163 self.flush() | 173 bs_found = False |
174 # The last 7 items in this range are as follows. | |
175 # items[last_item-7]: AHRD Descriptions | |
176 # items[last_item-6]: TAIR Gene(s) Descriptions | |
177 # items[last_item-5]: Pfam Domains | |
178 # items[last_item-4]: InterProScan Descriptions | |
179 # items[last_item-3]: GO Molecular Functions | |
180 # items[last_item-2]: GO Biological Processes | |
181 # items[last_item-1]: GO Cellular Components | |
182 # We'll translate each of these items into a JSON | |
183 # dictionary for inserting into the table. | |
184 if k >= (last_item-7) and k <= last_item: | |
185 json_str = str(items[k]) | |
186 # Here is an example string: | |
187 # Phosphate transporter PHO1 [0.327] | Phosphate | |
188 for bs in BLACKLIST_STRINGS: | |
189 if json_str.find(bs) >= 0: | |
190 bs_found = True | |
191 args.append(None) | |
192 break | |
193 if not bs_found: | |
194 # We'll split the string on " | " to create each value. | |
195 # The keys will be zero-padded integers to enable sorting. | |
196 json_dict = dict() | |
197 json_vals = json_str.split(' | ') | |
198 for key_index, json_val in enumerate(json_vals): | |
199 # The zero-padded key is 0 based. | |
200 json_key = '%04d' % key_index | |
201 json_dict[json_key] = json_val | |
202 args.append(json_dict) | |
203 else: | |
204 args.append('%s' % str(items[k])) | |
205 sql = self.pto_table.insert().values(args) | |
206 try: | |
207 self.engine.execute(sql) | |
208 except Exception as e: | |
209 msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e) | |
210 self.stop_err(msg) | |
164 i += 1 | 211 i += 1 |
165 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) | 212 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) |
166 for file_name in glob.glob(os.path.join(file_dir, "*list")): | 213 for file_name in glob.glob(os.path.join(file_dir, "*list")): |
167 items = os.path.basename(file_name).split(".") | 214 items = os.path.basename(file_name).split(".") |
168 clustering_method = items[0] | 215 clustering_method = items[0] |