annotate export_all_sample_data.py @ 2:80880af69fe0 draft

Uploaded
author greg
date Wed, 26 Aug 2020 13:47:04 -0400
parents 633cb7831526
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f15e63d204b1 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
1
633cb7831526 Uploaded
greg
parents: 0
diff changeset
2
0
f15e63d204b1 Uploaded
greg
parents:
diff changeset
3
f15e63d204b1 Uploaded
greg
parents:
diff changeset
4 import argparse
f15e63d204b1 Uploaded
greg
parents:
diff changeset
5 import datetime
2
80880af69fe0 Uploaded
greg
parents: 1
diff changeset
6
0
f15e63d204b1 Uploaded
greg
parents:
diff changeset
7 import psycopg2
f15e63d204b1 Uploaded
greg
parents:
diff changeset
8
2
80880af69fe0 Uploaded
greg
parents: 1
diff changeset
9 from sqlalchemy import MetaData
0
f15e63d204b1 Uploaded
greg
parents:
diff changeset
10 from sqlalchemy import create_engine
f15e63d204b1 Uploaded
greg
parents:
diff changeset
11 from sqlalchemy.engine.url import make_url
f15e63d204b1 Uploaded
greg
parents:
diff changeset
12
f15e63d204b1 Uploaded
greg
parents:
diff changeset
13 metadata = MetaData()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
14
f15e63d204b1 Uploaded
greg
parents:
diff changeset
15 COLUMNS = ["Affymetrix ID", "Sample ID", "User Specimen ID", "Field Call", "Sample Depth",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
16 "Percent Missing Data Coral", "Percent Heterozygous Coral", "Percent Acerv Coral",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
17 "Percent Apalm Coral", "Bcoral Genet Id", "Registry ID", "DNA Extraction Method",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
18 "DNA Concentration", "Colony Location", "Colony Latitude", "Colony Longitude",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
19 "Colony Depth", "Reef Name", "Region", "Reef Latitude", "Reef Longitude",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
20 "GPS Coordinates Associated With", "Coral Mlg Clonal ID", "Coral Mlg Rep Sample ID",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
21 "Genetic Coral Species Call", "Spawning", "Sperm Motility", "TLE", "Disease Resist",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
22 "Bleach Resist", "Mortality", "Healing Time", "Sequencing Facility", "Array Version",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
23 "Plate Barcode", "Collector Last Name", "First Name", "Organization", "Email",
f15e63d204b1 Uploaded
greg
parents:
diff changeset
24 "Collection Date"]
f15e63d204b1 Uploaded
greg
parents:
diff changeset
25
f15e63d204b1 Uploaded
greg
parents:
diff changeset
26
f15e63d204b1 Uploaded
greg
parents:
diff changeset
27 class ExportAllSampleData(object):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
28 def __init__(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
29 self.args = None
f15e63d204b1 Uploaded
greg
parents:
diff changeset
30 self.conn = None
f15e63d204b1 Uploaded
greg
parents:
diff changeset
31 self.parse_args()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
32 self.outfh = open(self.args.output, "w")
f15e63d204b1 Uploaded
greg
parents:
diff changeset
33 self.outfh.write("%s\n" % "\t".join(COLUMNS))
f15e63d204b1 Uploaded
greg
parents:
diff changeset
34 self.connect_db()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
35 self.engine = create_engine(self.args.database_connection_string)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
36
f15e63d204b1 Uploaded
greg
parents:
diff changeset
37 def connect_db(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
38 url = make_url(self.args.database_connection_string)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
39 args = url.translate_connect_args(username='user')
f15e63d204b1 Uploaded
greg
parents:
diff changeset
40 args.update(url.query)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
41 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
f15e63d204b1 Uploaded
greg
parents:
diff changeset
42 self.conn = psycopg2.connect(**args)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
43
f15e63d204b1 Uploaded
greg
parents:
diff changeset
44 def export_from_db(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
45 today = datetime.date.today()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
46 cmd = """
f15e63d204b1 Uploaded
greg
parents:
diff changeset
47 SELECT sample.affy_id, sample.sample_id, sample.genotype_id,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
48 sample.phenotype_id, sample.experiment_id, sample.colony_id,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
49 sample.colony_location, sample.collector_id, sample.collection_date,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
50 sample.user_specimen_id, sample.registry_id, sample.depth AS sample_depth,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
51 sample.dna_extraction_method, sample.dna_concentration,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
52 sample.percent_missing_data_coral, sample.percent_acerv_coral,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
53 sample.percent_apalm_coral, sample.percent_heterozygous_coral,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
54 sample.field_call, sample.bcoral_genet_id, genotype.coral_mlg_clonal_id,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
55 genotype.coral_mlg_rep_sample_id, genotype.genetic_coral_species_call,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
56 phenotype.spawning, phenotype.sperm_motility, phenotype.tle,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
57 phenotype.disease_resist, phenotype.bleach_resist, phenotype.mortality,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
58 phenotype.healing_time, experiment.seq_facility, experiment.array_version,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
59 experiment.plate_barcode, colony.latitude AS colony_latitude,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
60 colony.longitude AS colony_longitude, colony.depth AS colony_depth,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
61 reef.name, reef.region, reef.latitude AS reef_latitude, reef.longitude AS reef_longitude,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
62 reef.geographic_origin, person.last_name, person.first_name,
f15e63d204b1 Uploaded
greg
parents:
diff changeset
63 person.organization, person.email
f15e63d204b1 Uploaded
greg
parents:
diff changeset
64 FROM sample
f15e63d204b1 Uploaded
greg
parents:
diff changeset
65 LEFT OUTER JOIN genotype
f15e63d204b1 Uploaded
greg
parents:
diff changeset
66 ON sample.genotype_id = genotype.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
67 LEFT OUTER JOIN phenotype
f15e63d204b1 Uploaded
greg
parents:
diff changeset
68 ON sample.phenotype_id = phenotype.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
69 LEFT OUTER JOIN experiment
f15e63d204b1 Uploaded
greg
parents:
diff changeset
70 ON sample.experiment_id = experiment.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
71 LEFT OUTER JOIN colony
f15e63d204b1 Uploaded
greg
parents:
diff changeset
72 ON sample.colony_id = colony.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
73 LEFT OUTER JOIN reef
f15e63d204b1 Uploaded
greg
parents:
diff changeset
74 ON reef.id = colony.reef_id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
75 LEFT OUTER JOIN person
f15e63d204b1 Uploaded
greg
parents:
diff changeset
76 ON sample.collector_id = person.id
f15e63d204b1 Uploaded
greg
parents:
diff changeset
77 WHERE sample.public OR sample.public_after_date < date'%s'
f15e63d204b1 Uploaded
greg
parents:
diff changeset
78 ORDER BY affy_id;""" % today
f15e63d204b1 Uploaded
greg
parents:
diff changeset
79 # Instantiate the cursor.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
80 cur = self.conn.cursor()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
81 # Execute the query.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
82 cur.execute(cmd)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
83 rows = cur.fetchall()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
84 for tup in rows:
f15e63d204b1 Uploaded
greg
parents:
diff changeset
85 values = self.extract_values(tup)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
86 # Output the row.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
87 self.outfh.write("%s\n" % "\t".join(values))
f15e63d204b1 Uploaded
greg
parents:
diff changeset
88
f15e63d204b1 Uploaded
greg
parents:
diff changeset
89 def extract_values(self, tup):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
90 values = []
f15e63d204b1 Uploaded
greg
parents:
diff changeset
91 # Extract the items from the tuple.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
92 affy_id = self.get_value(tup[0])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
93 sample_id = self.get_value(tup[1])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
94 colony_location = self.get_value(tup[6])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
95 collection_date = self.get_value(tup[8])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
96 if len(collection_date) > 0:
f15e63d204b1 Uploaded
greg
parents:
diff changeset
97 collection_date = collection_date[:10]
f15e63d204b1 Uploaded
greg
parents:
diff changeset
98 user_specimen_id = self.get_value(tup[9])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
99 registry_id = self.get_value(tup[10])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
100 sample_depth = self.get_value(tup[11])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
101 dna_extraction_method = self.get_value(tup[12])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
102 dna_concentration = self.get_value(tup[13])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
103 percent_missing_data_coral = self.get_value(tup[14])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
104 percent_acerv_coral = self.get_value(tup[15])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
105 percent_apalm_coral = self.get_value(tup[16])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
106 percent_heterozygous_coral = self.get_value(tup[17])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
107 field_call = self.get_value(tup[18])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
108 bcoral_genet_id = self.get_value(tup[19])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
109 coral_mlg_clonal_id = self.get_value(tup[20])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
110 coral_mlg_rep_sample_id = self.get_value(tup[21])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
111 genetic_coral_species_call = self.get_value(tup[22])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
112 spawning = self.get_value(tup[23])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
113 sperm_motility = self.get_value(tup[24])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
114 tle = self.get_value(tup[25])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
115 disease_resist = self.get_value(tup[26])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
116 bleach_resist = self.get_value(tup[27])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
117 mortality = self.get_value(tup[28])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
118 healing_time = self.get_value(tup[29])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
119 seq_facility = self.get_value(tup[30])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
120 array_version = self.get_value(tup[31])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
121 plate_barcode = self.get_value(tup[32])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
122 colony_latitude = self.get_value(tup[33])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
123 colony_longitude = self.get_value(tup[34])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
124 colony_depth = self.get_value(tup[35])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
125 reef_name = self.get_value(tup[36])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
126 region = self.get_value(tup[37])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
127 reef_latitude = self.get_value(tup[38])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
128 reef_longitude = self.get_value(tup[39])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
129 geographic_origin = self.get_value(tup[40])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
130 last_name = self.get_value(tup[41])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
131 first_name = self.get_value(tup[42])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
132 organization = self.get_value(tup[43])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
133 email = self.get_value(tup[44])
f15e63d204b1 Uploaded
greg
parents:
diff changeset
134 # Append the columns in the specified order.
f15e63d204b1 Uploaded
greg
parents:
diff changeset
135 values.append(affy_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
136 values.append(sample_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
137 values.append(user_specimen_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
138 values.append(field_call)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
139 values.append(sample_depth)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
140 values.append(percent_missing_data_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
141 values.append(percent_heterozygous_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
142 values.append(percent_acerv_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
143 values.append(percent_apalm_coral)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
144 values.append(bcoral_genet_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
145 values.append(registry_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
146 values.append(dna_extraction_method)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
147 values.append(dna_concentration)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
148 values.append(colony_location)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
149 values.append(colony_latitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
150 values.append(colony_longitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
151 values.append(colony_depth)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
152 values.append(reef_name)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
153 values.append(region)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
154 values.append(reef_latitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
155 values.append(reef_longitude)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
156 values.append(geographic_origin)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
157 values.append(coral_mlg_clonal_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
158 values.append(coral_mlg_rep_sample_id)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
159 values.append(genetic_coral_species_call)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
160 values.append(spawning)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
161 values.append(sperm_motility)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
162 values.append(tle)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
163 values.append(disease_resist)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
164 values.append(bleach_resist)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
165 values.append(mortality)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
166 values.append(healing_time)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
167 values.append(seq_facility)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
168 values.append(array_version)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
169 values.append(plate_barcode)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
170 values.append(last_name)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
171 values.append(first_name)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
172 values.append(organization)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
173 values.append(email)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
174 values.append(collection_date)
f15e63d204b1 Uploaded
greg
parents:
diff changeset
175 return values
f15e63d204b1 Uploaded
greg
parents:
diff changeset
176
f15e63d204b1 Uploaded
greg
parents:
diff changeset
177 def get_value(self, loc):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
178 return str(loc) or ""
f15e63d204b1 Uploaded
greg
parents:
diff changeset
179
f15e63d204b1 Uploaded
greg
parents:
diff changeset
180 def parse_args(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
181 parser = argparse.ArgumentParser()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
182 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
f15e63d204b1 Uploaded
greg
parents:
diff changeset
183 parser.add_argument('--output', dest='output', help='Output dataset'),
f15e63d204b1 Uploaded
greg
parents:
diff changeset
184 self.args = parser.parse_args()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
185
f15e63d204b1 Uploaded
greg
parents:
diff changeset
186 def run(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
187 self.export_from_db()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
188
f15e63d204b1 Uploaded
greg
parents:
diff changeset
189 def shutdown(self):
f15e63d204b1 Uploaded
greg
parents:
diff changeset
190 self.outfh.flush()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
191 self.outfh.close()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
192 self.conn.close()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
193
f15e63d204b1 Uploaded
greg
parents:
diff changeset
194
f15e63d204b1 Uploaded
greg
parents:
diff changeset
195 if __name__ == '__main__':
f15e63d204b1 Uploaded
greg
parents:
diff changeset
196 easd = ExportAllSampleData()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
197 easd.run()
f15e63d204b1 Uploaded
greg
parents:
diff changeset
198 easd.shutdown()