#!/usr/bin/env python

import argparse
import sys

import psycopg2
from sqlalchemy import create_engine
from sqlalchemy.engine.url import make_url

# Names of the fixed header columns of the (VCF-derived) ids file. A line that
# is exactly one of these values is skipped when the Affymetrix ids are read.
SKIP_VALS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']


class EnsureSynced(object):
    """Check that a file of Affymetrix ids matches the ids in the database.

    The ids read from the ``genotype`` table and the ids read from the file
    are both sorted and compared.  A report is written to the ``--output``
    file, and the process exits with status 1 when the two differ.
    """

    def __init__(self):
        """Parse the command line, open the output file and connect to the database."""
        self.args = None
        self.conn = None
        self.outfh = None
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        try:
            self.connect_db()
            # Not used by any method of this class; kept so the attribute
            # stays available to code that may read it.
            self.engine = create_engine(self.args.database_connection_string)
        except Exception:
            # Do not leave the output file (or a half-open connection) behind
            # when start-up fails.
            self.shutdown()
            raise
        self.coral_mlg_rep_sample_ids_from_db = []
        self.affy_ids_from_file = []

    def connect_db(self):
        """Open a psycopg2 connection described by the connection string.

        Raises:
            AssertionError: if the connection string is not for PostgreSQL.
        """
        url = make_url(self.args.database_connection_string)
        # Raised explicitly (rather than with ``assert``) so the check is
        # still performed when Python runs with -O.
        if url.get_dialect().name != 'postgresql':
            raise AssertionError('This script can only be used with PostgreSQL.')
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        self.conn = psycopg2.connect(**args)

    def get_coral_mlg_rep_sample_ids_from_db(self):
        """Append the rep sample ids from the genotype table and sort the list."""
        cmd = "SELECT coral_mlg_rep_sample_id, coral_mlg_clonal_id FROM genotype WHERE coral_mlg_rep_sample_id IS NOT NULL AND coral_mlg_rep_sample_id != '' AND coral_mlg_clonal_id != 'failed' ORDER BY coral_mlg_rep_sample_id;"
        cur = self.conn.cursor()
        try:
            cur.execute(cmd)
            rows = cur.fetchall()
        finally:
            cur.close()
        self.coral_mlg_rep_sample_ids_from_db.extend(row[0] for row in rows)
        # Sorted in Python as well as in SQL so the ordering rules are the
        # same as those applied to the ids read from the file.
        self.coral_mlg_rep_sample_ids_from_db.sort()

    def get_affy_ids_from_file(self, f):
        """Append the ids found in file ``f`` (one per line) and sort the list.

        Blank lines and lines equal to one of ``SKIP_VALS`` are ignored.
        """
        with open(f) as fh:
            for line in fh:
                line = line.strip()
                # The database query never returns an empty id, so a blank
                # line would otherwise always be reported as a mismatch.
                if not line or line in SKIP_VALS:
                    continue
                self.affy_ids_from_file.append(line)
        self.affy_ids_from_file.sort()

    def get_difference(self, list1, list2):
        """Return the sorted items of the longer list that are missing from the other.

        When ``list1`` is strictly longer the result is ``list1 - list2``,
        otherwise it is ``list2 - list1``.  Only one direction is returned, so
        this cannot describe two lists that each hold ids the other lacks.
        """
        if len(list1) > len(list2):
            return sorted(set(list1) - set(list2))
        return sorted(set(list2) - set(list1))

    def log(self, msg):
        """Write ``msg`` followed by a newline to the output file."""
        self.outfh.write("%s\n" % msg)

    def parse_args(self):
        """Parse the command line into ``self.args``; all three options are required."""
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def _log_ids(self, header, ids):
        """Write ``header`` followed by one id per line."""
        self.log(header)
        for affy_id in ids:
            self.log("%s\n" % affy_id)

    def _write_report(self):
        """Compare the two loaded id lists, write the report, return True if in sync."""
        db_ids = self.coral_mlg_rep_sample_ids_from_db
        file_ids = self.affy_ids_from_file
        in_sync = db_ids == file_ids
        if in_sync:
            self.log("The selected file is in sync with the database.\n\n")
        else:
            self.log("The selected file is not in sync with the database.\n\n")
        self.log("Number of coral mlg rep sample ids in the database: %d\n" % len(db_ids))
        self.log("Number of Affymetrix ids in the file: %d\n" % len(file_ids))
        if in_sync:
            return True
        # Report both directions: comparing only the lengths hides ids that
        # are missing from the longer (or an equally long) list.
        db_set = set(db_ids)
        file_set = set(file_ids)
        only_in_db = sorted(db_set - file_set)
        only_in_file = sorted(file_set - db_set)
        if only_in_db:
            self._log_ids("The database contains the following Affymetrix ids that are not in the file.\n", only_in_db)
        if only_in_file:
            self._log_ids("The file contains the following Affymetrix ids that are not in the database.\n", only_in_file)
        if not only_in_db and not only_in_file:
            # Same distinct ids on both sides, so the lists can only differ
            # in how often an id is repeated.
            self.log("The file and the database contain the same ids, but at least one id is duplicated.\n")
        return False

    def run(self):
        """Load both id lists, write the report, and exit with status 1 if they differ."""
        self.get_coral_mlg_rep_sample_ids_from_db()
        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
        if not self._write_report():
            # Release the output file and the database connection before exiting.
            self.shutdown()
            sys.exit(1)

    def shutdown(self):
        """Flush and close the output file and close the database connection.

        Safe to call more than once and before either resource was opened.
        """
        if self.outfh is not None and not self.outfh.closed:
            self.outfh.flush()
            self.outfh.close()
        if self.conn is not None:
            self.conn.close()
            self.conn = None


if __name__ == '__main__':
    # Constructing the checker parses the command line, opens the output
    # file and connects to the database.
    es = EnsureSynced()
    # run() exits the process with status 1 when the file and the database
    # differ, so shutdown() below is only reached when they are in sync.
    es.run()
    es.shutdown()
+ − 99 es.shutdown()