view export_all_sample_data.py @ 3:4337b52d67cf draft default tip

Uploaded
author greg
date Wed, 11 Nov 2020 18:18:25 +0000
parents 80880af69fe0
children
line wrap: on
line source

#!/usr/bin/env python


import argparse
import datetime

import psycopg2

from sqlalchemy import MetaData
from sqlalchemy import create_engine
from sqlalchemy.engine.url import make_url

metadata = MetaData()

COLUMNS = ["Affymetrix ID", "Sample ID", "User Specimen ID", "Field Call", "Sample Depth",
           "Percent Missing Data Coral", "Percent Heterozygous Coral", "Percent Acerv Coral",
           "Percent Apalm Coral", "Bcoral Genet Id", "Registry ID", "DNA Extraction Method",
           "DNA Concentration", "Colony Location", "Colony Latitude", "Colony Longitude",
           "Colony Depth", "Reef Name", "Region", "Reef Latitude", "Reef Longitude",
           "GPS Coordinates Associated With", "Coral Mlg Clonal ID", "Coral Mlg Rep Sample ID",
           "Genetic Coral Species Call", "Spawning", "Sperm Motility", "TLE", "Disease Resist",
           "Bleach Resist", "Mortality", "Healing Time", "Sequencing Facility", "Array Version",
           "Plate Barcode", "Collector Last Name", "First Name", "Organization", "Email",
           "Collection Date"]


class ExportAllSampleData(object):
    def __init__(self):
        self.args = None
        self.conn = None
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        self.outfh.write("%s\n" % "\t".join(COLUMNS))
        self.connect_db()
        self.engine = create_engine(self.args.database_connection_string)

    def connect_db(self):
        url = make_url(self.args.database_connection_string)
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
        self.conn = psycopg2.connect(**args)

    def export_from_db(self):
        today = datetime.date.today()
        cmd = """
           SELECT sample.affy_id, sample.sample_id, sample.genotype_id,
           sample.phenotype_id, sample.experiment_id, sample.colony_id,
           sample.colony_location, sample.collector_id, sample.collection_date,
           sample.user_specimen_id, sample.registry_id, sample.depth AS sample_depth,
           sample.dna_extraction_method, sample.dna_concentration,
           sample.percent_missing_data_coral, sample.percent_acerv_coral,
           sample.percent_apalm_coral, sample.percent_heterozygous_coral,
           sample.field_call, sample.bcoral_genet_id, genotype.coral_mlg_clonal_id,
           genotype.coral_mlg_rep_sample_id, genotype.genetic_coral_species_call,
           phenotype.spawning, phenotype.sperm_motility, phenotype.tle,
           phenotype.disease_resist, phenotype.bleach_resist, phenotype.mortality,
           phenotype.healing_time, experiment.seq_facility, experiment.array_version,
           experiment.plate_barcode, colony.latitude AS colony_latitude,
           colony.longitude AS colony_longitude, colony.depth AS colony_depth,
           reef.name, reef.region, reef.latitude AS reef_latitude, reef.longitude AS reef_longitude,
           reef.geographic_origin, person.last_name, person.first_name,
           person.organization, person.email
           FROM sample
           LEFT OUTER JOIN genotype
                           ON sample.genotype_id = genotype.id
           LEFT OUTER JOIN phenotype
                           ON sample.phenotype_id = phenotype.id
           LEFT OUTER JOIN experiment
                           ON sample.experiment_id = experiment.id
           LEFT OUTER JOIN colony
                           ON sample.colony_id = colony.id
           LEFT OUTER JOIN reef
                           ON reef.id = colony.reef_id
           LEFT OUTER JOIN person
                           ON sample.collector_id = person.id
           WHERE sample.public OR sample.public_after_date < date'%s'
           ORDER BY affy_id;""" % today
        # Instantiate the cursor.
        cur = self.conn.cursor()
        # Execute the query.
        cur.execute(cmd)
        rows = cur.fetchall()
        for tup in rows:
            values = self.extract_values(tup)
            # Output the row.
            self.outfh.write("%s\n" % "\t".join(values))

    def extract_values(self, tup):
        values = []
        # Extract the items from the tuple.
        affy_id = self.get_value(tup[0])
        sample_id = self.get_value(tup[1])
        colony_location = self.get_value(tup[6])
        collection_date = self.get_value(tup[8])
        if len(collection_date) > 0:
            collection_date = collection_date[:10]
        user_specimen_id = self.get_value(tup[9])
        registry_id = self.get_value(tup[10])
        sample_depth = self.get_value(tup[11])
        dna_extraction_method = self.get_value(tup[12])
        dna_concentration = self.get_value(tup[13])
        percent_missing_data_coral = self.get_value(tup[14])
        percent_acerv_coral = self.get_value(tup[15])
        percent_apalm_coral = self.get_value(tup[16])
        percent_heterozygous_coral = self.get_value(tup[17])
        field_call = self.get_value(tup[18])
        bcoral_genet_id = self.get_value(tup[19])
        coral_mlg_clonal_id = self.get_value(tup[20])
        coral_mlg_rep_sample_id = self.get_value(tup[21])
        genetic_coral_species_call = self.get_value(tup[22])
        spawning = self.get_value(tup[23])
        sperm_motility = self.get_value(tup[24])
        tle = self.get_value(tup[25])
        disease_resist = self.get_value(tup[26])
        bleach_resist = self.get_value(tup[27])
        mortality = self.get_value(tup[28])
        healing_time = self.get_value(tup[29])
        seq_facility = self.get_value(tup[30])
        array_version = self.get_value(tup[31])
        plate_barcode = self.get_value(tup[32])
        colony_latitude = self.get_value(tup[33])
        colony_longitude = self.get_value(tup[34])
        colony_depth = self.get_value(tup[35])
        reef_name = self.get_value(tup[36])
        region = self.get_value(tup[37])
        reef_latitude = self.get_value(tup[38])
        reef_longitude = self.get_value(tup[39])
        geographic_origin = self.get_value(tup[40])
        last_name = self.get_value(tup[41])
        first_name = self.get_value(tup[42])
        organization = self.get_value(tup[43])
        email = self.get_value(tup[44])
        # Append the columns in the specified order.
        values.append(affy_id)
        values.append(sample_id)
        values.append(user_specimen_id)
        values.append(field_call)
        values.append(sample_depth)
        values.append(percent_missing_data_coral)
        values.append(percent_heterozygous_coral)
        values.append(percent_acerv_coral)
        values.append(percent_apalm_coral)
        values.append(bcoral_genet_id)
        values.append(registry_id)
        values.append(dna_extraction_method)
        values.append(dna_concentration)
        values.append(colony_location)
        values.append(colony_latitude)
        values.append(colony_longitude)
        values.append(colony_depth)
        values.append(reef_name)
        values.append(region)
        values.append(reef_latitude)
        values.append(reef_longitude)
        values.append(geographic_origin)
        values.append(coral_mlg_clonal_id)
        values.append(coral_mlg_rep_sample_id)
        values.append(genetic_coral_species_call)
        values.append(spawning)
        values.append(sperm_motility)
        values.append(tle)
        values.append(disease_resist)
        values.append(bleach_resist)
        values.append(mortality)
        values.append(healing_time)
        values.append(seq_facility)
        values.append(array_version)
        values.append(plate_barcode)
        values.append(last_name)
        values.append(first_name)
        values.append(organization)
        values.append(email)
        values.append(collection_date)
        return values

    def get_value(self, loc):
        return str(loc) or ""

    def parse_args(self):
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
        parser.add_argument('--output', dest='output', help='Output dataset'),
        self.args = parser.parse_args()

    def run(self):
        self.export_from_db()

    def shutdown(self):
        self.outfh.flush()
        self.outfh.close()
        self.conn.close()


if __name__ == '__main__':
    easd = ExportAllSampleData()
    easd.run()
    easd.shutdown()