annotate export_all_sample_data.py @ 0:f15e63d204b1 draft

Uploaded
author greg
date Mon, 23 Sep 2019 14:12:31 -0400
parents
children 633cb7831526
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f15e63d204b1 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
f15e63d204b1 Uploaded
greg
parents:
diff changeset
2 from __future__ import print_function
f15e63d204b1 Uploaded
greg
parents:
diff changeset
3
f15e63d204b1 Uploaded
greg
parents:
diff changeset
4 import argparse
f15e63d204b1 Uploaded
greg
parents:
diff changeset
5 import datetime
f15e63d204b1 Uploaded
greg
parents:
diff changeset
6 import psycopg2
f15e63d204b1 Uploaded
greg
parents:
diff changeset
7
f15e63d204b1 Uploaded
greg
parents:
diff changeset
8 from sqlalchemy import create_engine
f15e63d204b1 Uploaded
greg
parents:
diff changeset
9 from sqlalchemy import MetaData
f15e63d204b1 Uploaded
greg
parents:
diff changeset
10 from sqlalchemy.engine.url import make_url
f15e63d204b1 Uploaded
greg
parents:
diff changeset
11
f15e63d204b1 Uploaded
greg
parents:
diff changeset
12 metadata = MetaData()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
13
f15e63d204b1 Uploaded
greg
parents:
diff changeset
14 COLUMNS = ["Affymetrix ID", "Sample ID", "User Specimen ID", "Field Call", "Sample Depth",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
15 "Percent Missing Data Coral", "Percent Heterozygous Coral", "Percent Acerv Coral",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
16 "Percent Apalm Coral", "Bcoral Genet Id", "Registry ID", "DNA Extraction Method",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
17 "DNA Concentration", "Colony Location", "Colony Latitude", "Colony Longitude",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
18 "Colony Depth", "Reef Name", "Region", "Reef Latitude", "Reef Longitude",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
19 "GPS Coordinates Associated With", "Coral Mlg Clonal ID", "Coral Mlg Rep Sample ID",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
20 "Genetic Coral Species Call", "Spawning", "Sperm Motility", "TLE", "Disease Resist",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
21 "Bleach Resist", "Mortality", "Healing Time", "Sequencing Facility", "Array Version",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
22 "Plate Barcode", "Collector Last Name", "First Name", "Organization", "Email",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
23 "Collection Date"]
f15e63d204b1 Uploaded
greg
parents:
diff changeset
24
f15e63d204b1 Uploaded
greg
parents:
diff changeset
25
f15e63d204b1 Uploaded
greg
parents:
diff changeset
26 class ExportAllSampleData(object):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
27 def __init__(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
28 self.args = None
f15e63d204b1 Uploaded
greg
parents:
diff changeset
29 self.conn = None
f15e63d204b1 Uploaded
greg
parents:
diff changeset
30 self.parse_args()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
31 self.outfh = open(self.args.output, "w")
f15e63d204b1 Uploaded
greg
parents:
diff changeset
32 self.outfh.write("%s\n" % "\t".join(COLUMNS))
f15e63d204b1 Uploaded
greg
parents:
diff changeset
33 self.connect_db()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
34 self.engine = create_engine(self.args.database_connection_string)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
35
f15e63d204b1 Uploaded
greg
parents:
diff changeset
36 def connect_db(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
37 url = make_url(self.args.database_connection_string)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
38 args = url.translate_connect_args(username='user')
f15e63d204b1 Uploaded
greg
parents:
diff changeset
39 args.update(url.query)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
40 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
f15e63d204b1 Uploaded
greg
parents:
diff changeset
41 self.conn = psycopg2.connect(**args)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
42
f15e63d204b1 Uploaded
greg
parents:
diff changeset
43 def export_from_db(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
44 today = datetime.date.today()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
45 cmd = """
f15e63d204b1 Uploaded
greg
parents:
diff changeset
46 SELECT sample.affy_id, sample.sample_id, sample.genotype_id,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
47 sample.phenotype_id, sample.experiment_id, sample.colony_id,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
48 sample.colony_location, sample.collector_id, sample.collection_date,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
49 sample.user_specimen_id, sample.registry_id, sample.depth AS sample_depth,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
50 sample.dna_extraction_method, sample.dna_concentration,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
51 sample.percent_missing_data_coral, sample.percent_acerv_coral,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
52 sample.percent_apalm_coral, sample.percent_heterozygous_coral,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
53 sample.field_call, sample.bcoral_genet_id, genotype.coral_mlg_clonal_id,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
54 genotype.coral_mlg_rep_sample_id, genotype.genetic_coral_species_call,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
55 phenotype.spawning, phenotype.sperm_motility, phenotype.tle,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
56 phenotype.disease_resist, phenotype.bleach_resist, phenotype.mortality,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
57 phenotype.healing_time, experiment.seq_facility, experiment.array_version,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
58 experiment.plate_barcode, colony.latitude AS colony_latitude,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
59 colony.longitude AS colony_longitude, colony.depth AS colony_depth,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
60 reef.name, reef.region, reef.latitude AS reef_latitude, reef.longitude AS reef_longitude,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
61 reef.geographic_origin, person.last_name, person.first_name,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
62 person.organization, person.email
f15e63d204b1 Uploaded
greg
parents:
diff changeset
63 FROM sample
f15e63d204b1 Uploaded
greg
parents:
diff changeset
64 LEFT OUTER JOIN genotype
f15e63d204b1 Uploaded
greg
parents:
diff changeset
65 ON sample.genotype_id = genotype.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
66 LEFT OUTER JOIN phenotype
f15e63d204b1 Uploaded
greg
parents:
diff changeset
67 ON sample.phenotype_id = phenotype.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
68 LEFT OUTER JOIN experiment
f15e63d204b1 Uploaded
greg
parents:
diff changeset
69 ON sample.experiment_id = experiment.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
70 LEFT OUTER JOIN colony
f15e63d204b1 Uploaded
greg
parents:
diff changeset
71 ON sample.colony_id = colony.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
72 LEFT OUTER JOIN reef
f15e63d204b1 Uploaded
greg
parents:
diff changeset
73 ON reef.id = colony.reef_id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
74 LEFT OUTER JOIN person
f15e63d204b1 Uploaded
greg
parents:
diff changeset
75 ON sample.collector_id = person.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
76 WHERE sample.public OR sample.public_after_date < date'%s'
f15e63d204b1 Uploaded
greg
parents:
diff changeset
77 ORDER BY affy_id;""" % today
f15e63d204b1 Uploaded
greg
parents:
diff changeset
78 # Instantiate the cursor.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
79 cur = self.conn.cursor()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
80 # Execute the query.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
81 cur.execute(cmd)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
82 rows = cur.fetchall()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
83 for tup in rows:
f15e63d204b1 Uploaded
greg
parents:
diff changeset
84 values = self.extract_values(tup)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
85 # Output the row.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
86 self.outfh.write("%s\n" % "\t".join(values))
f15e63d204b1 Uploaded
greg
parents:
diff changeset
87
f15e63d204b1 Uploaded
greg
parents:
diff changeset
88 def extract_values(self, tup):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
89 values = []
f15e63d204b1 Uploaded
greg
parents:
diff changeset
90 # Extract the items from the tuple.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
91 affy_id = self.get_value(tup[0])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
92 sample_id = self.get_value(tup[1])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
93 colony_location = self.get_value(tup[6])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
94 collection_date = self.get_value(tup[8])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
95 if len(collection_date) > 0:
f15e63d204b1 Uploaded
greg
parents:
diff changeset
96 collection_date = collection_date[:10]
f15e63d204b1 Uploaded
greg
parents:
diff changeset
97 user_specimen_id = self.get_value(tup[9])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
98 registry_id = self.get_value(tup[10])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
99 sample_depth = self.get_value(tup[11])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
100 dna_extraction_method = self.get_value(tup[12])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
101 dna_concentration = self.get_value(tup[13])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
102 percent_missing_data_coral = self.get_value(tup[14])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
103 percent_acerv_coral = self.get_value(tup[15])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
104 percent_apalm_coral = self.get_value(tup[16])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
105 percent_heterozygous_coral = self.get_value(tup[17])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
106 field_call = self.get_value(tup[18])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
107 bcoral_genet_id = self.get_value(tup[19])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
108 coral_mlg_clonal_id = self.get_value(tup[20])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
109 coral_mlg_rep_sample_id = self.get_value(tup[21])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
110 genetic_coral_species_call = self.get_value(tup[22])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
111 spawning = self.get_value(tup[23])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
112 sperm_motility = self.get_value(tup[24])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
113 tle = self.get_value(tup[25])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
114 disease_resist = self.get_value(tup[26])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
115 bleach_resist = self.get_value(tup[27])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
116 mortality = self.get_value(tup[28])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
117 healing_time = self.get_value(tup[29])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
118 seq_facility = self.get_value(tup[30])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
119 array_version = self.get_value(tup[31])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
120 plate_barcode = self.get_value(tup[32])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
121 colony_latitude = self.get_value(tup[33])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
122 colony_longitude = self.get_value(tup[34])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
123 colony_depth = self.get_value(tup[35])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
124 reef_name = self.get_value(tup[36])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
125 region = self.get_value(tup[37])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
126 reef_latitude = self.get_value(tup[38])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
127 reef_longitude = self.get_value(tup[39])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
128 geographic_origin = self.get_value(tup[40])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
129 last_name = self.get_value(tup[41])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
130 first_name = self.get_value(tup[42])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
131 organization = self.get_value(tup[43])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
132 email = self.get_value(tup[44])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
133 # Append the columns in the specified order.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
134 values.append(affy_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
135 values.append(sample_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
136 values.append(user_specimen_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
137 values.append(field_call)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
138 values.append(sample_depth)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
139 values.append(percent_missing_data_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
140 values.append(percent_heterozygous_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
141 values.append(percent_acerv_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
142 values.append(percent_apalm_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
143 values.append(bcoral_genet_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
144 values.append(registry_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
145 values.append(dna_extraction_method)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
146 values.append(dna_concentration)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
147 values.append(colony_location)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
148 values.append(colony_latitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
149 values.append(colony_longitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
150 values.append(colony_depth)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
151 values.append(reef_name)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
152 values.append(region)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
153 values.append(reef_latitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
154 values.append(reef_longitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
155 values.append(geographic_origin)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
156 values.append(coral_mlg_clonal_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
157 values.append(coral_mlg_rep_sample_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
158 values.append(genetic_coral_species_call)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
159 values.append(spawning)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
160 values.append(sperm_motility)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
161 values.append(tle)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
162 values.append(disease_resist)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
163 values.append(bleach_resist)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
164 values.append(mortality)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
165 values.append(healing_time)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
166 values.append(seq_facility)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
167 values.append(array_version)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
168 values.append(plate_barcode)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
169 values.append(last_name)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
170 values.append(first_name)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
171 values.append(organization)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
172 values.append(email)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
173 values.append(collection_date)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
174 return values
f15e63d204b1 Uploaded
greg
parents:
diff changeset
175
f15e63d204b1 Uploaded
greg
parents:
diff changeset
176 def get_value(self, loc):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
177 return str(loc) or ""
f15e63d204b1 Uploaded
greg
parents:
diff changeset
178
f15e63d204b1 Uploaded
greg
parents:
diff changeset
179 def parse_args(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
180 parser = argparse.ArgumentParser()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
181 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
f15e63d204b1 Uploaded
greg
parents:
diff changeset
182 parser.add_argument('--output', dest='output', help='Output dataset'),
f15e63d204b1 Uploaded
greg
parents:
diff changeset
183 self.args = parser.parse_args()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
184
f15e63d204b1 Uploaded
greg
parents:
diff changeset
185 def run(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
186 self.export_from_db()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
187
f15e63d204b1 Uploaded
greg
parents:
diff changeset
188 def shutdown(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
189 self.outfh.flush()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
190 self.outfh.close()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
191 self.conn.close()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
192
f15e63d204b1 Uploaded
greg
parents:
diff changeset
193
f15e63d204b1 Uploaded
greg
parents:
diff changeset
194 if __name__ == '__main__':
f15e63d204b1 Uploaded
greg
parents:
diff changeset
195 easd = ExportAllSampleData()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
196 easd.run()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
197 easd.shutdown()