comparison export_all_sample_data.py @ 0:f15e63d204b1 draft

Uploaded
author greg
date Mon, 23 Sep 2019 14:12:31 -0400
parents
children 633cb7831526
comparison
equal deleted inserted replaced
-1:000000000000 0:f15e63d204b1
1 #!/usr/bin/env python
2 from __future__ import print_function
3
4 import argparse
5 import datetime
6 import psycopg2
7
8 from sqlalchemy import create_engine
9 from sqlalchemy import MetaData
10 from sqlalchemy.engine.url import make_url
11
12 metadata = MetaData()
13
14 COLUMNS = ["Affymetrix ID", "Sample ID", "User Specimen ID", "Field Call", "Sample Depth",
15 "Percent Missing Data Coral", "Percent Heterozygous Coral", "Percent Acerv Coral",
16 "Percent Apalm Coral", "Bcoral Genet Id", "Registry ID", "DNA Extraction Method",
17 "DNA Concentration", "Colony Location", "Colony Latitude", "Colony Longitude",
18 "Colony Depth", "Reef Name", "Region", "Reef Latitude", "Reef Longitude",
19 "GPS Coordinates Associated With", "Coral Mlg Clonal ID", "Coral Mlg Rep Sample ID",
20 "Genetic Coral Species Call", "Spawning", "Sperm Motility", "TLE", "Disease Resist",
21 "Bleach Resist", "Mortality", "Healing Time", "Sequencing Facility", "Array Version",
22 "Plate Barcode", "Collector Last Name", "First Name", "Organization", "Email",
23 "Collection Date"]
24
25
26 class ExportAllSampleData(object):
27 def __init__(self):
28 self.args = None
29 self.conn = None
30 self.parse_args()
31 self.outfh = open(self.args.output, "w")
32 self.outfh.write("%s\n" % "\t".join(COLUMNS))
33 self.connect_db()
34 self.engine = create_engine(self.args.database_connection_string)
35
36 def connect_db(self):
37 url = make_url(self.args.database_connection_string)
38 args = url.translate_connect_args(username='user')
39 args.update(url.query)
40 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
41 self.conn = psycopg2.connect(**args)
42
43 def export_from_db(self):
44 today = datetime.date.today()
45 cmd = """
46 SELECT sample.affy_id, sample.sample_id, sample.genotype_id,
47 sample.phenotype_id, sample.experiment_id, sample.colony_id,
48 sample.colony_location, sample.collector_id, sample.collection_date,
49 sample.user_specimen_id, sample.registry_id, sample.depth AS sample_depth,
50 sample.dna_extraction_method, sample.dna_concentration,
51 sample.percent_missing_data_coral, sample.percent_acerv_coral,
52 sample.percent_apalm_coral, sample.percent_heterozygous_coral,
53 sample.field_call, sample.bcoral_genet_id, genotype.coral_mlg_clonal_id,
54 genotype.coral_mlg_rep_sample_id, genotype.genetic_coral_species_call,
55 phenotype.spawning, phenotype.sperm_motility, phenotype.tle,
56 phenotype.disease_resist, phenotype.bleach_resist, phenotype.mortality,
57 phenotype.healing_time, experiment.seq_facility, experiment.array_version,
58 experiment.plate_barcode, colony.latitude AS colony_latitude,
59 colony.longitude AS colony_longitude, colony.depth AS colony_depth,
60 reef.name, reef.region, reef.latitude AS reef_latitude, reef.longitude AS reef_longitude,
61 reef.geographic_origin, person.last_name, person.first_name,
62 person.organization, person.email
63 FROM sample
64 LEFT OUTER JOIN genotype
65 ON sample.genotype_id = genotype.id
66 LEFT OUTER JOIN phenotype
67 ON sample.phenotype_id = phenotype.id
68 LEFT OUTER JOIN experiment
69 ON sample.experiment_id = experiment.id
70 LEFT OUTER JOIN colony
71 ON sample.colony_id = colony.id
72 LEFT OUTER JOIN reef
73 ON reef.id = colony.reef_id
74 LEFT OUTER JOIN person
75 ON sample.collector_id = person.id
76 WHERE sample.public OR sample.public_after_date < date'%s'
77 ORDER BY affy_id;""" % today
78 # Instantiate the cursor.
79 cur = self.conn.cursor()
80 # Execute the query.
81 cur.execute(cmd)
82 rows = cur.fetchall()
83 for tup in rows:
84 values = self.extract_values(tup)
85 # Output the row.
86 self.outfh.write("%s\n" % "\t".join(values))
87
88 def extract_values(self, tup):
89 values = []
90 # Extract the items from the tuple.
91 affy_id = self.get_value(tup[0])
92 sample_id = self.get_value(tup[1])
93 colony_location = self.get_value(tup[6])
94 collection_date = self.get_value(tup[8])
95 if len(collection_date) > 0:
96 collection_date = collection_date[:10]
97 user_specimen_id = self.get_value(tup[9])
98 registry_id = self.get_value(tup[10])
99 sample_depth = self.get_value(tup[11])
100 dna_extraction_method = self.get_value(tup[12])
101 dna_concentration = self.get_value(tup[13])
102 percent_missing_data_coral = self.get_value(tup[14])
103 percent_acerv_coral = self.get_value(tup[15])
104 percent_apalm_coral = self.get_value(tup[16])
105 percent_heterozygous_coral = self.get_value(tup[17])
106 field_call = self.get_value(tup[18])
107 bcoral_genet_id = self.get_value(tup[19])
108 coral_mlg_clonal_id = self.get_value(tup[20])
109 coral_mlg_rep_sample_id = self.get_value(tup[21])
110 genetic_coral_species_call = self.get_value(tup[22])
111 spawning = self.get_value(tup[23])
112 sperm_motility = self.get_value(tup[24])
113 tle = self.get_value(tup[25])
114 disease_resist = self.get_value(tup[26])
115 bleach_resist = self.get_value(tup[27])
116 mortality = self.get_value(tup[28])
117 healing_time = self.get_value(tup[29])
118 seq_facility = self.get_value(tup[30])
119 array_version = self.get_value(tup[31])
120 plate_barcode = self.get_value(tup[32])
121 colony_latitude = self.get_value(tup[33])
122 colony_longitude = self.get_value(tup[34])
123 colony_depth = self.get_value(tup[35])
124 reef_name = self.get_value(tup[36])
125 region = self.get_value(tup[37])
126 reef_latitude = self.get_value(tup[38])
127 reef_longitude = self.get_value(tup[39])
128 geographic_origin = self.get_value(tup[40])
129 last_name = self.get_value(tup[41])
130 first_name = self.get_value(tup[42])
131 organization = self.get_value(tup[43])
132 email = self.get_value(tup[44])
133 # Append the columns in the specified order.
134 values.append(affy_id)
135 values.append(sample_id)
136 values.append(user_specimen_id)
137 values.append(field_call)
138 values.append(sample_depth)
139 values.append(percent_missing_data_coral)
140 values.append(percent_heterozygous_coral)
141 values.append(percent_acerv_coral)
142 values.append(percent_apalm_coral)
143 values.append(bcoral_genet_id)
144 values.append(registry_id)
145 values.append(dna_extraction_method)
146 values.append(dna_concentration)
147 values.append(colony_location)
148 values.append(colony_latitude)
149 values.append(colony_longitude)
150 values.append(colony_depth)
151 values.append(reef_name)
152 values.append(region)
153 values.append(reef_latitude)
154 values.append(reef_longitude)
155 values.append(geographic_origin)
156 values.append(coral_mlg_clonal_id)
157 values.append(coral_mlg_rep_sample_id)
158 values.append(genetic_coral_species_call)
159 values.append(spawning)
160 values.append(sperm_motility)
161 values.append(tle)
162 values.append(disease_resist)
163 values.append(bleach_resist)
164 values.append(mortality)
165 values.append(healing_time)
166 values.append(seq_facility)
167 values.append(array_version)
168 values.append(plate_barcode)
169 values.append(last_name)
170 values.append(first_name)
171 values.append(organization)
172 values.append(email)
173 values.append(collection_date)
174 return values
175
176 def get_value(self, loc):
177 return str(loc) or ""
178
179 def parse_args(self):
180 parser = argparse.ArgumentParser()
181 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
182 parser.add_argument('--output', dest='output', help='Output dataset'),
183 self.args = parser.parse_args()
184
185 def run(self):
186 self.export_from_db()
187
188 def shutdown(self):
189 self.outfh.flush()
190 self.outfh.close()
191 self.conn.close()
192
193
194 if __name__ == '__main__':
195 easd = ExportAllSampleData()
196 easd.run()
197 easd.shutdown()