Mercurial > repos > greg > ensure_synced
changeset 0:9180906544b6 draft
Uploaded
author | greg |
---|---|
date | Thu, 15 Aug 2019 10:37:49 -0400 |
parents | |
children | 281f38df3c58 |
files | .shed.yml ensure_synced.py ensure_synced.xml |
diffstat | 3 files changed, 157 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Thu Aug 15 10:37:49 2019 -0400 @@ -0,0 +1,14 @@ +name: ensure_synced +owner: greg +description: | + Compares a list of Affymetrix ids from a vcf file with those in a database table for equivalency. +homepage_url: http://baumslab.org +long_description: | + Compares the set of Affymetrix id strings contained within a selected VCF file with the set of Affymetrix + ids contained in the affy_id column of the sample table in the corals (stag) database for all samples not + in a failed state. If these sets are equivalent, the file and the database are considered to be in sync + with each other. +remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/ensure_synced +type: unrestricted +categories: + - Micro-array Analysis
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ensure_synced.py Thu Aug 15 10:37:49 2019 -0400 @@ -0,0 +1,103 @@ +#!/usr/bin/env python +from __future__ import print_function + +import argparse +import psycopg2 +import sys + +from sqlalchemy import create_engine +from sqlalchemy import MetaData +from sqlalchemy.engine.url import make_url + +metadata = MetaData() + +SKIP_VALS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + + +class EnsureSynced(object): + def __init__(self): + self.args = None + self.conn = None + self.parse_args() + self.outfh = open(self.args.output, "w") + self.connect_db() + self.engine = create_engine(self.args.database_connection_string) + self.metadata = MetaData(self.engine) + self.affy_ids_from_db = [] + self.affy_ids_from_file = [] + + def connect_db(self): + url = make_url(self.args.database_connection_string) + args = url.translate_connect_args(username='user') + args.update(url.query) + assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' + self.conn = psycopg2.connect(**args) + + def get_affy_ids_from_db(self): + cmd = "SELECT affy_id FROM sample WHERE genotype_id NOT IN (SELECT id FROM genotype WHERE coral_mlg_clonal_id = 'failed') ORDER BY affy_id;" + cur = self.conn.cursor() + cur.execute(cmd) + rows = cur.fetchall() + for row in rows: + self.affy_ids_from_db.append(row[0]) + + def get_affy_ids_from_file(self, f): + with open(f) as fh: + for line in fh: + line = line.strip() + if line in SKIP_VALS: + # Skip the first 9 lines in the file. 
+ continue + self.affy_ids_from_file.append(line) + self.affy_ids_from_file.sort() + + def get_difference(self, list1, list2): + if len(list1) > len(list2): + return list(set(list1) - set(list2)) + return list(set(list2) - set(list1)) + + def log(self, msg): + self.outfh.write("%s\n" % msg) + + def parse_args(self): + parser = argparse.ArgumentParser() + parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), + parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', help='Affy ids taken from all previously genotyped samples vcf file') + parser.add_argument('--output', dest='output', help='Output dataset'), + self.args = parser.parse_args() + + def run(self): + self.get_affy_ids_from_db() + self.get_affy_ids_from_file(self.args.affy_ids_from_file) + if self.affy_ids_from_db == self.affy_ids_from_file: + in_sync = True + self.log("The selected file is in sync with the database.\n\n") + else: + in_sync = False + self.log("The selected file is not in sync with the database.\n\n") + num_affy_ids_from_db = len(self.affy_ids_from_db) + self.log("Number of Affymetrix ids in the database: %d\n" % num_affy_ids_from_db) + num_affy_ids_from_file = len(self.affy_ids_from_file) + self.log("Number of Affymetrix ids in the file: %d\n" % num_affy_ids_from_file) + if not in_sync: + if num_affy_ids_from_db > num_affy_ids_from_file: + self.log("The database contains the following Affymetrix ids that are not in the file.\n") + else: + self.log("The file contains the following Affymetrix ids that are not in the database.\n") + diff_list = self.get_difference(self.affy_ids_from_db, self.affy_ids_from_file) + for affy_id in diff_list: + self.log("%s\n" % affy_id) + self.outfh.flush() + self.outfh.close() + sys.exit(1) + + def shutdown(self): + self.outfh.flush() + self.outfh.close() + self.conn.close() + + +if __name__ == '__main__': + es = EnsureSynced() + es.run() + es.shutdown()
<tool id="ensure_synced" name="Ensure synchronized" version="1.0.0">
    <description>analysis components</description>
    <!--
        Pipeline: extract the "#CHROM" header line from the VCF, split it into
        one column name per line, then let ensure_synced.py compare the sample
        ids found there with the database.  Galaxy joins the lines of the
        command into a single shell line, so every step must be chained with
        "&&"; otherwise the python invocation would be passed to tr as extra
        operands.
    -->
    <command detect_errors="exit_code"><![CDATA[
#set affy_ids_from_file = 'affy_ids_from_file.txt'
grep "#CHROM" '$input' > test.head &&
tr '\t' '\n' < test.head > '$affy_ids_from_file' &&
python '$__tool_directory__/ensure_synced.py'
--database_connection_string '$__app__.config.corals_database_connection'
--affy_ids_from_file '$affy_ids_from_file'
--output '$output']]></command>
    <inputs>
        <param name="input" format="vcf" type="data" label="All genotyped samples file"/>
    </inputs>
    <outputs>
        <data name="output" format="txt" label="${tool.name} (process log) on ${on_string}"/>
    </outputs>
    <tests>
        <test>
            <!--Testing this tool is a bit difficult at the current time.-->
        </test>
    </tests>
    <help>
**What it does**

Compares the set of Affymetrix id strings contained within a selected VCF file with the set of Affymetrix ids contained
in the affy_id column of the sample table in the corals (stag) database for all samples not in a failed state.  If these
sets are equivalent, the file and the database are considered to be in sync with each other.
    </help>
    <citations>
        <citation type="bibtex">
            @misc{None,
            journal = {None},
            author = {Baums I},
            title = {Manuscript in preparation},
            year = {None},
            url = {http://baumslab.org}
            }
        </citation>
    </citations>
</tool>