update_stag_database
changeset 0:572cf766c59d draft
Uploaded

author   | greg
date     | Thu, 15 Aug 2019 13:08:33 -0400
parents  |
children | f0c4f7f2ecd4
files    | .shed.yml update_stag_database.py update_stag_database.xml usd_config.ini.sample
diffstat | 4 files changed, 765 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Thu Aug 15 13:08:33 2019 -0400
@@ -0,0 +1,13 @@
+name: update_stag_database
+owner: greg
+description: |
+  Contains a tool that updates the corals (stag) database tables from a dataset collection.
+homepage_url: http://baumslab.org
+long_description: |
+  Contains a tool that updates the corals (stag) database tables from a dataset collection where each
+  item in the collection is a tabular file that will be parsed to insert rows into a table defined by
+  the name of the file.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/update_stag_database
+type: unrestricted
+categories:
+  - Micro-array Analysis
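As the long_description above says, each element of the input collection is matched to a database table by its file-name prefix. A hypothetical collection the tool could consume might look like the list below; the element names are illustrative, only the prefixes matter (see StagDatabaseUpdater.run() in the script that follows), and the order shown is the load order, not the collection order.

    # Hypothetical names for the nine collection elements; the prefix of
    # each name selects the target table (see StagDatabaseUpdater.run()).
    COLLECTION_ITEMS = [
        "experiment.tabular",  # loaded first
        "genotype.tabular",
        "allele.tabular",
        "person.tabular",
        "phenotype.tabular",
        "reef.tabular",
        "colony.tabular",
        "taxonomy.tabular",
        "sample.tabular",      # loaded last, after its foreign keys exist
    ]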
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/update_stag_database.py	Thu Aug 15 13:08:33 2019 -0400
@@ -0,0 +1,698 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import argparse
+import datetime
+import dateutil.parser
+import os
+import psycopg2
+import subprocess
+import sys
+
+from six.moves import configparser
+
+from sqlalchemy import create_engine
+from sqlalchemy import MetaData
+from sqlalchemy.engine.url import make_url
+
+now = datetime.datetime.utcnow
+metadata = MetaData()
+
+DEFAULT_MISSING_NUMERIC_VALUE = -9.000000
+
+
+def check_execution_errors(rc, fstderr, fstdout):
+    if rc != 0:
+        fh = open(fstdout, 'rb')
+        out_msg = fh.read()
+        fh.close()
+        fh = open(fstderr, 'rb')
+        err_msg = fh.read()
+        fh.close()
+        msg = '%s\n%s\n' % (str(out_msg), str(err_msg))
+        sys.exit(msg)
+
+
+def get_config_settings(config_file, section='defaults'):
+    # Return a dictionary consisting of the key / value pairs
+    # of the defaults section of config_file.
+    d = {}
+    config_parser = configparser.ConfigParser()
+    config_parser.read(config_file)
+    for key, value in config_parser.items(section):
+        if section == 'defaults':
+            # The string.upper() function was removed in Python 3;
+            # use the str method instead.
+            d[key.upper()] = value
+        else:
+            d[key] = value
+    return d
+
+
+def get_response_buffers():
+    fstderr = os.path.join(os.getcwd(), 'stderr.txt')
+    fherr = open(fstderr, 'wb')
+    fstdout = os.path.join(os.getcwd(), 'stdout.txt')
+    fhout = open(fstdout, 'wb')
+    return fstderr, fherr, fstdout, fhout
+
+
+def get_sql_param_val_str(column_val, default):
+    if set_to_null(column_val):
+        val = default
+    else:
+        val = column_val
+    return "= '%s'" % val
+
+
+def get_value_from_config(config_defaults, value):
+    return config_defaults.get(value, None)
+
+
+def get_year_from_now():
+    # Get the current date plus one year for possible insertion
+    # into the public_after_date column of the sample table.
+    # The default behavior is for the value of the public
+    # column to be True and the public_after_date to be NULL,
+    # making the sample "public".  However, the user can
+    # set the value of the public column to False and optionally
+    # set a date after which the sample becomes public in the
+    # Affymetrix 96 well plate metadata file associated with
+    # the sample.  If the value of the public column is set
+    # to False, but no date is set, the default date will be 1
+    # year from the time the row is inserted into the table.
+    today = datetime.date.today()
+    try:
+        # Return the same day of the year.
+        year = today.year + 1
+        return today.replace(year=year)
+    except Exception:
+        # Handle leap years.
+        return today + (datetime.date(today.year + 1, 1, 1) - datetime.date(today.year, 1, 1))
+
+
+def handle_column_value(val, get_sql_param=True, default=''):
+    # Regarding the default value, a NULL value indicates an unknown value
+    # and typically should not be confused with an empty string.  Our application
+    # does not need the concept of an unknown value, so most columns are
+    # non-nullable and our default is an empty string.
+    param = handle_null(val)
+    if get_sql_param:
+        param_val_str = get_sql_param_val_str(val, default)
+    if param is None:
+        if get_sql_param:
+            return default, param_val_str
+        return default
+    if get_sql_param:
+        return param, param_val_str
+    return param
+
+
+def handle_null(val):
+    if set_to_null(val):
+        return None
+    return val
+
+
+def run_command(cmd):
+    fstderr, fherr, fstdout, fhout = get_response_buffers()
+    proc = subprocess.Popen(args=cmd, stderr=fherr, stdout=fhout, shell=True)
+    rc = proc.wait()
+    # Check the results.
+    fherr.close()
+    fhout.close()
+    check_execution_errors(rc, fstderr, fstdout)
+
+
+def set_to_null(val):
+    if val in ["", "NA", "NULL"]:
+        return True
+    return False
+
+
+def split_line(line, sep="\t"):
+    # Remove R quote chars.
+    items = line.split(sep)
+    unquoted_items = []
+    for item in items:
+        unquoted_items.append(item.strip('"'))
+    return unquoted_items
+
+
+def string_as_bool(string):
+    if str(string).lower() in ('true', 'yes', 'on', '1'):
+        return True
+    else:
+        return False
+
+
+class StagDatabaseUpdater(object):
+    def __init__(self):
+        self.args = None
+        self.conn = None
+        self.parse_args()
+        self.year_from_now = get_year_from_now()
+        self.db_name = None
+        self.db_storage_dir = None
+        self.get_config_settings()
+        self.outfh = open(self.args.output, "w")
+        self.connect_db()
+        self.engine = create_engine(self.args.database_connection_string)
+        self.metadata = MetaData(self.engine)
+        self.affy_ids = []
+        self.allele_ids = []
+        self.colony_ids = []
+        self.experiment_ids = []
+        self.genotype_ids = []
+        self.person_ids = []
+        self.phenotype_ids = []
+        self.reef_ids = []
+        self.taxonomy_ids = []
+
+    def connect_db(self):
+        url = make_url(self.args.database_connection_string)
+        self.log('Attempting to connect to the database...')
+        args = url.translate_connect_args(username='user')
+        args.update(url.query)
+        assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
+        self.conn = psycopg2.connect(**args)
+        self.log("Successfully connected to the database...")
+
+    def convert_date_string_for_database(self, date_string):
+        # The value of date_string is %y/%m/%d with
+        # the year being 2 digits (yikes!).
+        fixed_century = "20%s" % date_string
+        fixed_date = fixed_century.replace("/", "-")
+        # Convert the string to a format required for
+        # inserting into the database.
+        database_format = dateutil.parser.parse(fixed_date)
+        return str(database_format)
+
+    def flush(self):
+        self.conn.commit()
+
+    def export_database(self):
+        # Export the database to the configured storage location.
+        if not os.path.isdir(self.db_storage_dir):
+            os.makedirs(self.db_storage_dir)
+        db_storage_path = os.path.join(self.db_storage_dir, "exported_%s_db" % self.db_name)
+        cmd = "pg_dump %s -f %s" % (self.db_name, db_storage_path)
+        run_command(cmd)
+
+    def get_config_settings(self):
+        config_defaults = get_config_settings(self.args.config_file)
+        self.db_name = get_value_from_config(config_defaults, 'DB_NAME')
+        self.db_storage_dir = get_value_from_config(config_defaults, 'DB_STORAGE_DIR')
+
+    def get_next_sample_id(self):
+        cmd = "SELECT sample_id FROM sample ORDER BY id DESC;"
+        cur = self.conn.cursor()
+        cur.execute(cmd)
+        try:
+            last_sample_id = cur.fetchone()[0]
+            # The value of last_sample_id will be something like A10171.
+            last_sample_id_num = int(last_sample_id.lstrip("A"))
+            next_sample_id_num = last_sample_id_num + 1
+            next_sample_id = "A%d" % next_sample_id_num
+        except Exception:
+            next_sample_id = "A10000"
+        return next_sample_id
+
+    def log(self, msg):
+        self.outfh.write("%s\n" % msg)
+
+    def update_allele_table(self, file_path):
+        self.log("Updating the allele table...")
+        # Columns in the allele file are:
+        # affy_id allele
+        allele_table_inserts = 0
+        # The allele.tabular file contains a subset of the number of samples
+        # to be inserted.  This is because those samples that failed will not
+        # be included in the file.  Failed samples will have an affy_id value
+        # of NA in self.affy_ids, which was generated when the genotype.tabular
+        # file was processed, so we'll use that list to build the correct list
+        # of self.allele_ids for later use when inserting into the sample table.
+        fh = open(file_path, "r")
+        # Skip the header.
+        fh.readline()
+        for id_index, affy_id in enumerate(self.affy_ids):
+            if set_to_null(affy_id):
+                # This is a failed sample, so no allele strings will be
+                # inserted, and we'll set the allele_id to the default
+                # empty string.
+                self.allele_ids.append("")
+                continue
+            # See if we need to add a row to the table.  The affy_id value
+            # should not exist in the sample table.
+            cmd = "SELECT allele_id FROM sample WHERE affy_id = '%s';" % affy_id
+            cur = self.conn.cursor()
+            cur.execute(cmd)
+            try:
+                allele_id = cur.fetchone()[0]
+            except Exception:
+                # Insert a row into the allele table.  NOW() is part of the
+                # SQL text; a bound 'NOW()' string would not be a valid
+                # timestamp literal.
+                line = fh.readline()
+                line = line.rstrip()
+                items = split_line(line)
+                allele = items[1]
+                cmd = "INSERT INTO allele VALUES (nextval('allele_id_seq'), NOW(), NOW(), %s) RETURNING id;"
+                args = [allele]
+                cur = self.update(cmd, args)
+                self.flush()
+                allele_id = cur.fetchone()[0]
+                allele_table_inserts += 1
+            self.allele_ids.append(allele_id)
+        fh.close()
+        self.log("Inserted %d rows into the allele table..." % allele_table_inserts)
+
+    def update_colony_table(self, file_path):
+        self.log("Updating the colony table...")
+        # Columns in the colony file are:
+        # latitude longitude depth geographic_origin
+        # The geographic_origin value is used for deciding into which table
+        # to insert the latitude and longitude values.  If the geographic_origin
+        # is "colony", the values will be inserted into the colony table.
+        colony_table_inserts = 0
+        with open(file_path) as fh:
+            for i, line in enumerate(fh):
+                if i == 0:
+                    # Skip the header.
+                    continue
+                # Keep track of foreign keys since we skip the header line.
+                id_index = i - 1
+                line = line.rstrip()
+                items = split_line(line)
+                geographic_origin = items[3]
+                if set_to_null(geographic_origin):
+                    geographic_origin = "reef"
+                else:
+                    geographic_origin = geographic_origin.lower()
+                if geographic_origin == "colony":
+                    latitude = "%6f" % float(items[0])
+                    longitude = "%6f" % float(items[1])
+                else:
+                    latitude = DEFAULT_MISSING_NUMERIC_VALUE
+                    longitude = DEFAULT_MISSING_NUMERIC_VALUE
+                depth = handle_column_value(items[2], get_sql_param=False, default=-9)
+                reef_id = self.reef_ids[id_index]
+                # See if we need to add a row to the table.
+                cmd = "SELECT id FROM colony WHERE latitude = %s " % latitude
+                cmd += "AND longitude = %s AND depth = %s " % (longitude, depth)
+                cmd += "AND reef_id = %s;" % reef_id
+                cur = self.conn.cursor()
+                cur.execute(cmd)
+                try:
+                    colony_id = cur.fetchone()[0]
+                except Exception:
+                    # Insert a row into the colony table.
+                    cmd = "INSERT INTO colony VALUES (nextval('colony_id_seq'), NOW(), NOW(), %s, %s, %s, %s) RETURNING id;"
+                    args = [longitude, latitude, depth, reef_id]
+                    cur = self.update(cmd, args)
+                    self.flush()
+                    colony_id = cur.fetchone()[0]
+                    colony_table_inserts += 1
+                self.colony_ids.append(colony_id)
+        self.log("Inserted %d rows into the colony table..." % colony_table_inserts)
+
+    def update_experiment_table(self, file_path):
+        self.log("Updating the experiment table...")
+        # Columns in the experiment file are:
+        # seq_facility array_version result_folder_name plate_barcode
+        experiment_table_inserts = 0
+        with open(file_path) as fh:
+            for i, line in enumerate(fh):
+                if i == 0:
+                    # Skip the header.
+                    continue
+                line = line.rstrip()
+                items = split_line(line)
+                seq_facility, seq_facility_param_val_str = handle_column_value(items[0])
+                array_version, array_version_param_val_str = handle_column_value(items[1])
+                result_folder_name, result_folder_name_param_val_str = handle_column_value(items[2])
+                plate_barcode, plate_barcode_param_val_str = handle_column_value(items[3])
+                # See if we need to add a row to the table.
+                cmd = "SELECT id FROM experiment WHERE seq_facility %s " % seq_facility_param_val_str
+                cmd += "AND array_version %s " % array_version_param_val_str
+                cmd += "AND result_folder_name %s " % result_folder_name_param_val_str
+                cmd += "AND plate_barcode %s;" % plate_barcode_param_val_str
+                cur = self.conn.cursor()
+                cur.execute(cmd)
+                try:
+                    experiment_id = cur.fetchone()[0]
+                except Exception:
+                    # Insert a row into the experiment table.
+                    cmd = "INSERT INTO experiment VALUES (nextval('experiment_id_seq'), NOW(), NOW(), %s, %s, %s, %s) RETURNING id;"
+                    args = [seq_facility, array_version, result_folder_name, plate_barcode]
+                    cur = self.update(cmd, args)
+                    self.flush()
+                    experiment_id = cur.fetchone()[0]
+                    experiment_table_inserts += 1
+                self.experiment_ids.append(experiment_id)
+        self.log("Inserted %d rows into the experiment table..." % experiment_table_inserts)
+
+    def update_genotype_table(self, file_path):
+        self.log("Updating the genotype table...")
+        # Columns in the genotype file are:
+        # affy_id coral_mlg_clonal_id user_specimen_id db_match genetic_coral_species_call
+        # coral_mlg_rep_sample_id
+        genotype_table_inserts = 0
+        with open(file_path) as fh:
+            for i, line in enumerate(fh):
+                if i == 0:
+                    # Skip the header.
+                    continue
+                line = line.rstrip()
+                items = split_line(line)
+                # Keep an in-memory list of affy_ids for use
+                # when updating the allele table.
+                self.affy_ids.append(items[0])
+                coral_mlg_clonal_id = items[1]
+                # The value of db_match will be "no_match" if
+                # a new row should be inserted into the table.
+                db_match = items[3].lower()
+                genetic_coral_species_call = handle_column_value(items[4], get_sql_param=False)
+                coral_mlg_rep_sample_id = handle_column_value(items[5], get_sql_param=False)
+                if db_match == "failed":
+                    # Handle the special case of a failed sample.
+                    cmd = "SELECT id FROM genotype WHERE coral_mlg_clonal_id = 'failed'"
+                else:
+                    cmd = "SELECT id FROM genotype WHERE coral_mlg_clonal_id = '%s'" % coral_mlg_clonal_id
+                # See if we need to add a row to the table.
+                cur = self.conn.cursor()
+                cur.execute(cmd)
+                try:
+                    genotype_id = cur.fetchone()[0]
+                    if db_match == "failed":
+                        val = db_match
+                    else:
+                        val = "match"
+                    self.log("Found genotype row with id %d, value of db_match: %s, should be %s." % (genotype_id, db_match, val))
+                except Exception:
+                    # Insert a row into the genotype table.
+ cmd = "INSERT INTO genotype VALUES (nextval('genotype_id_seq'), NOW(), NOW(), " + cmd += "'%s', '%s', '%s') RETURNING id;" + cmd = cmd % (coral_mlg_clonal_id, coral_mlg_rep_sample_id, genetic_coral_species_call) + args = [coral_mlg_clonal_id, coral_mlg_rep_sample_id, genetic_coral_species_call] + cur = self.update(cmd, args) + self.flush() + genotype_id = cur.fetchone()[0] + if db_match == "failed": + val = db_match + else: + val = "no_match" + self.log("Inserted genotype row with id %d, value of db_match: %s, should be %s." % (genotype_id, db_match, val)) + genotype_table_inserts += 1 + self.genotype_ids.append(genotype_id) + self.log("Inserted %d rows into the genotype table..." % genotype_table_inserts) + + def update_person_table(self, file_path): + self.log("Updating the person table...") + # Columns in the person file are: + # last_name first_name organization email + person_table_inserts = 0 + with open(file_path) as fh: + for i, line in enumerate(fh): + if i == 0: + # Skip header + continue + line = line.rstrip() + items = split_line(line) + last_name = items[0] + first_name = items[1] + organization = items[2] + email = items[3] + # See if we need to add a row to the table. + cmd = "SELECT id FROM person WHERE email = '%s';" % email + cur = self.conn.cursor() + cur.execute(cmd) + try: + person_id = cur.fetchone()[0] + except Exception: + # Insert a row into the person table. + cmd = "INSERT INTO person VALUES (nextval('person_id_seq'), NOW(), NOW(), " + cmd += "%s, %s, %s, %s) RETURNING id;" + args = [last_name, first_name, organization, email] + cur = self.update(cmd, args) + self.flush() + person_id = cur.fetchone()[0] + person_table_inserts += 1 + self.person_ids.append(person_id) + self.log("Inserted %d rows into the person table..." % person_table_inserts) + + def update_phenotype_table(self, file_path): + self.log("Updating the phenotype table...") + # Columns in the phenotype file are: + # disease_resist bleach_resist mortality tle spawning sperm_motility healing_time + phenotype_table_inserts = 0 + with open(file_path) as fh: + for i, line in enumerate(fh): + if i == 0: + # Skip header + continue + line = line.rstrip() + items = split_line(line) + disease_resist, disease_resist_param_val_str = handle_column_value(items[0], default=-9) + bleach_resist, bleach_resist_param_val_str = handle_column_value(items[1], default=-9) + mortality, mortality_param_val_str = handle_column_value(items[2], default=-9) + tle, tle_param_val_str = handle_column_value(items[3], default=-9) + spawning, spawning_param_val_str = handle_column_value(items[4]) + sperm_motility, sperm_motility_param_val_str = handle_column_value(items[5], default=-9.0) + healing_time, healing_time_param_val_str = handle_column_value(items[6], default=-9.0) + # See if we need to add a row to the phenotype table. + cmd = " SELECT id FROM phenotype WHERE disease_resist %s " + cmd += "AND bleach_resist %s AND mortality %s AND tle %s " + cmd += "AND spawning %s AND sperm_motility %s AND healing_time %s;" + cmd = cmd % (disease_resist_param_val_str, bleach_resist_param_val_str, + mortality_param_val_str, tle_param_val_str, spawning_param_val_str, + sperm_motility_param_val_str, healing_time_param_val_str) + cur = self.conn.cursor() + cur.execute(cmd) + try: + phenotype_id = cur.fetchone()[0] + except Exception: + # Insert a row into the phenotype table. 
+ cmd = "INSERT INTO phenotype VALUES (nextval('phenotype_id_seq'), NOW(), NOW(), " + cmd += "%s, %s, %s, %s, %s, %s, %s) RETURNING id;" + args = [disease_resist, bleach_resist, mortality, tle, spawning, sperm_motility, healing_time] + cur = self.update(cmd, args) + self.flush() + phenotype_id = cur.fetchone()[0] + phenotype_table_inserts += 1 + self.phenotype_ids.append(phenotype_id) + self.log("Inserted %d rows into the phenotype table..." % phenotype_table_inserts) + + def update_reef_table(self, file_path): + self.log("Updating the reef table...") + # Columns in the reef file are: + # name region latitude longitude geographic_origin + # The geographic_origin value is used for deciding into which table + # to insert the latitude and longitude values. If the geographic_origin + # is "reef", the values will be inserted into the reef table. + reef_table_inserts = 0 + with open(file_path) as fh: + for i, line in enumerate(fh): + if i == 0: + # Skip header + continue + line = line.rstrip() + items = split_line(line) + name = items[0] + region = items[1] + geographic_origin = items[4] + if set_to_null(geographic_origin): + geographic_origin = "reef" + else: + geographic_origin = geographic_origin.lower() + if geographic_origin == "reef": + latitude = "%6f" % float(items[2]) + longitude = "%6f" % float(items[3]) + else: + latitude = DEFAULT_MISSING_NUMERIC_VALUE + longitude = DEFAULT_MISSING_NUMERIC_VALUE + # See if we need to add a row to the reef table. + cmd = "SELECT id FROM reef WHERE name = $$%s$$ AND region = '%s' " % (name, region) + cmd += "AND latitude = %s AND longitude = %s " % (latitude, longitude) + cmd += "AND geographic_origin = '%s';" % geographic_origin + cur = self.conn.cursor() + cur.execute(cmd) + try: + reef_id = cur.fetchone()[0] + except Exception: + # Insert a row into the reef table. + cmd = "INSERT INTO reef VALUES (nextval('reef_id_seq'), %s, %s, %s, %s, %s, %s, %s) RETURNING id;" + args = ['NOW()', 'NOW()', name, region, latitude, longitude, geographic_origin] + cur = self.update(cmd, args) + self.flush() + reef_id = cur.fetchone()[0] + reef_table_inserts += 1 + self.reef_ids.append(reef_id) + self.log("Inserted %d rows into the reef table..." % reef_table_inserts) + + def update_sample_table(self, file_path): + self.log("Updating the sample table...") + # Columns in the sample file are: + # affy_id colony_location collection_date user_specimen_id registry_id + # depth dna_extraction_method dna_concentration public public_after_date + # percent_missing_data_coral percent_missing_data_sym percent_reference_coral percent_reference_sym percent_alternative_coral + # percent_alternative_sym percent_heterozygous_coral percent_heterozygous_sym field_call, bcoral_genet_id + sample_table_inserts = 0 + with open(file_path) as fh: + for i, line in enumerate(fh): + if i == 0: + # Skip header + continue + line = line.rstrip() + # Keep track of foreign keys since we skip the header line. 
+                id_index = i - 1
+                items = split_line(line)
+                sample_id = self.get_next_sample_id()
+                allele_id = self.allele_ids[id_index]
+                genotype_id = self.genotype_ids[id_index]
+                phenotype_id = self.phenotype_ids[id_index]
+                experiment_id = self.experiment_ids[id_index]
+                colony_id = self.colony_ids[id_index]
+                colony_location = handle_column_value(items[1], get_sql_param=False)
+                taxonomy_id = self.taxonomy_ids[id_index]
+                collector_id = self.person_ids[id_index]
+                collection_date = items[2]
+                user_specimen_id = items[3]
+                affy_id = handle_column_value(items[0], get_sql_param=False, default="%s_%s" % (sample_id, user_specimen_id))
+                registry_id = handle_column_value(items[4], get_sql_param=False, default=-9)
+                depth = handle_column_value(items[5], get_sql_param=False, default=-9)
+                dna_extraction_method = handle_column_value(items[6], get_sql_param=False)
+                dna_concentration = handle_column_value(items[7], get_sql_param=False)
+                public = items[8]
+                if string_as_bool(public):
+                    public_after_date = ''
+                else:
+                    if set_to_null(items[9]):
+                        public_after_date = self.year_from_now
+                    else:
+                        public_after_date = items[9]
+                percent_missing_data_coral = handle_column_value(items[10], get_sql_param=False)
+                percent_missing_data_sym = handle_column_value(items[11], get_sql_param=False)
+                percent_reference_coral = handle_column_value(items[12], get_sql_param=False)
+                percent_reference_sym = handle_column_value(items[13], get_sql_param=False)
+                percent_alternative_coral = handle_column_value(items[14], get_sql_param=False)
+                percent_alternative_sym = handle_column_value(items[15], get_sql_param=False)
+                percent_heterozygous_coral = handle_column_value(items[16], get_sql_param=False)
+                percent_heterozygous_sym = handle_column_value(items[17], get_sql_param=False)
+                field_call = handle_column_value(items[18], get_sql_param=False)
+                bcoral_genet_id = handle_column_value(items[19], get_sql_param=False)
+                # Insert a row into the sample table.  There are 28 %s
+                # placeholders, one per element of args below; NOW() fills
+                # the create and update timestamp columns.
+                cmd = "INSERT INTO sample VALUES (nextval('sample_id_seq'), NOW(), NOW(), %s, %s, %s, %s, %s, "
+                cmd += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
+                cmd += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id;"
+                args = [affy_id, sample_id, allele_id, genotype_id, phenotype_id,
+                        experiment_id, colony_id, colony_location, taxonomy_id, collector_id,
+                        collection_date, user_specimen_id, registry_id, depth,
+                        dna_extraction_method, dna_concentration, public, public_after_date,
+                        percent_missing_data_coral, percent_missing_data_sym, percent_reference_coral,
+                        percent_reference_sym, percent_alternative_coral, percent_alternative_sym,
+                        percent_heterozygous_coral, percent_heterozygous_sym, field_call, bcoral_genet_id]
+                cur = self.update(cmd, args)
+                self.flush()
+                sample_id = cur.fetchone()[0]
+                sample_table_inserts += 1
+        self.log("Inserted %d rows into the sample table..." % sample_table_inserts)
+
+    def update_taxonomy_table(self, file_path):
+        self.log("Updating the taxonomy table...")
+        # Columns in the taxonomy file are:
+        # genetic_coral_species_call affy_id genus_name species_name
+        taxonomy_table_inserts = 0
+        with open(file_path) as fh:
+            for i, line in enumerate(fh):
+                if i == 0:
+                    # Skip the header.
+                    continue
+                line = line.rstrip()
+                items = split_line(line)
+                genus_name = handle_column_value(items[2], get_sql_param=False, default='unknown')
+                species_name = handle_column_value(items[3], get_sql_param=False, default='unknown')
+                # See if we need to add a row to the taxonomy table.
+ cmd = "SELECT id FROM taxonomy WHERE species_name = '%s' AND genus_name = '%s';" % (species_name, genus_name) + cur = self.conn.cursor() + cur.execute(cmd) + try: + taxonomy_id = cur.fetchone()[0] + except Exception: + # Insert a row into the taxonomy table. + cmd = "INSERT INTO taxonomy VALUES (nextval('taxonomy_id_seq'), %s, %s, %s, %s) RETURNING id;" + args = ['NOW()', 'NOW()', species_name, genus_name] + cur = self.update(cmd, args) + self.flush() + taxonomy_id = cur.fetchone()[0] + taxonomy_table_inserts += 1 + self.taxonomy_ids.append(taxonomy_id) + self.log("Inserted %d rows into the taxonomy table..." % taxonomy_table_inserts) + + def parse_args(self): + parser = argparse.ArgumentParser() + parser.add_argument('--config_file', dest='config_file', help='usd_config.ini'), + parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), + parser.add_argument('--input_dir', dest='input_dir', help='Input datasets for database insertion') + parser.add_argument('--output', dest='output', help='Output dataset'), + self.args = parser.parse_args() + + def run(self): + self.export_database() + input_dir = self.args.input_dir + for file_name in os.listdir(input_dir): + # Tables must be loaded in such a way that foreign keys + # are properly handled. The sample table must be loaded + # last. + if file_name.startswith("allele"): + allele_file = os.path.join(input_dir, file_name) + if file_name.startswith("colony"): + colony_file = os.path.join(input_dir, file_name) + if file_name.startswith("experiment"): + experiment_file = os.path.join(input_dir, file_name) + if file_name.startswith("genotype"): + genotype_file = os.path.join(input_dir, file_name) + elif file_name.startswith("person"): + person_file = os.path.join(input_dir, file_name) + elif file_name.startswith("phenotype"): + phenotype_file = os.path.join(input_dir, file_name) + elif file_name.startswith("reef"): + reef_file = os.path.join(input_dir, file_name) + elif file_name.startswith("sample"): + sample_file = os.path.join(input_dir, file_name) + elif file_name.startswith("taxonomy"): + taxonomy_file = os.path.join(input_dir, file_name) + # Now tables can be loaded in the appropriate order. + self.update_experiment_table(experiment_file) + self.update_genotype_table(genotype_file) + self.update_allele_table(allele_file) + self.update_person_table(person_file) + self.update_phenotype_table(phenotype_file) + self.update_reef_table(reef_file) + self.update_colony_table(colony_file) + self.update_taxonomy_table(taxonomy_file) + self.update_sample_table(sample_file) + + def shutdown(self): + self.log("Shutting down...") + self.outfh.flush() + self.outfh.close() + self.conn.close() + + def update(self, cmd, args): + for i, arg in enumerate(args): + args[i] = handle_null(arg) + try: + cur = self.conn.cursor() + cur.execute(cmd, tuple(args)) + except Exception as e: + msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (cmd.format(args), e) + sys.stderr.write(msg) + self.outfh.flush() + self.outfh.close() + self.conn.close() + sys.exit(1) + return cur + + +if __name__ == '__main__': + sdu = StagDatabaseUpdater() + sdu.run() + sdu.shutdown()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/update_stag_database.xml	Thu Aug 15 13:08:33 2019 -0400
@@ -0,0 +1,45 @@
+<tool id="update_stag_database" name="Update stag database" version="1.0.0">
+    <description>with new genotyped samples</description>
+    <command detect_errors="exit_code"><![CDATA[
+#set input_dir = "input_dir"
+mkdir $input_dir &&
+#for $i in $input:
+    #set filename = $i.file_name
+    #set name = $i.name
+    ln -s $filename $input_dir/$name &&
+#end for
+python '$__tool_directory__/update_stag_database.py'
+--config_file '$__tool_directory__/usd_config.ini'
+--database_connection_string '$__app__.config.corals_database_connection'
+--input_dir '$input_dir'
+--output '$output']]></command>
+    <inputs>
+        <param name="input" format="tabular" type="data_collection" collection_type="list" label="Tabular datasets for database update"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} (process log) on ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <!-- Testing this tool is difficult at the current time. -->
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts a set of tabular datasets, one for each appropriate table in the corals (stag) database, and
+updates each table with the data.  The database must be PostgreSQL, and this tool exports the database
+to a file stored in a configured directory on disk before the database is updated.
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {Baums I},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {http://baumslab.org}}
+        </citation>
+    </citations>
+</tool>
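The staging loop inside the command block above symlinks each collection element into input_dir under its element name, so the script can dispatch on file-name prefixes. A rough Python equivalent of what the Cheetah loop expands to, with the collection list standing in as hypothetical (path, name) pairs:

    import os

    # Hypothetical (dataset path, element name) pairs standing in for $input.
    collection = [
        ("/galaxy/datasets/dataset_101.dat", "genotype.tabular"),
        ("/galaxy/datasets/dataset_102.dat", "sample.tabular"),
    ]
    input_dir = "input_dir"
    os.mkdir(input_dir)
    for file_name, name in collection:
        # Same effect as the 'ln -s $filename $input_dir/$name' line above.
        os.symlink(file_name, os.path.join(input_dir, name))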
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usd_config.ini.sample Thu Aug 15 13:08:33 2019 -0400 @@ -0,0 +1,9 @@ +# Configuration file for the update_stag_database tool. + +[defaults] + +# This section contains default settings for command line parameters that +# can be overridden when they are passed to executed scripts. + +DB_STORAGE_DIR = /tmp/db_store +DB_NAME = stag
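For reference, get_config_settings() in update_stag_database.py upper-cases the option names it reads from the defaults section, which is why the sample uses DB_STORAGE_DIR and DB_NAME. A minimal sketch of that lookup, assuming the sample has been copied to usd_config.ini:

    from six.moves import configparser

    parser = configparser.ConfigParser()
    parser.read("usd_config.ini")
    # configparser lower-cases option names on read, so upper-case them for
    # lookup, mirroring get_config_settings() in the script above.
    settings = dict((key.upper(), value) for key, value in parser.items("defaults"))
    print(settings["DB_NAME"])         # stag
    print(settings["DB_STORAGE_DIR"])  # /tmp/db_store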