changeset 0:9180906544b6 draft

Uploaded
author greg
date Thu, 15 Aug 2019 10:37:49 -0400
parents
children 281f38df3c58
files .shed.yml ensure_synced.py ensure_synced.xml
diffstat 3 files changed, 157 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Thu Aug 15 10:37:49 2019 -0400
@@ -0,0 +1,14 @@
+name: ensure_synced
+owner: greg
+description: |
+  Compares a list of Affymetrix ids from a vcf file with those in a database table for equivalency.
+homepage_url: http://baumslab.org
+long_description: |
+  Compares the set of Affymetrix id strings contained within a selected VCF file with the set of Affymetrix
+  ids contained in the affy_id column of the sample table in the corals (stag) database for all samples not
+  in a failed state.  If these sets are equivalent, the file and the database are considered to be in sync
+  with each other.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/ensure_synced
+type: unrestricted
+categories:
+  - Micro-array Analysis
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ensure_synced.py	Thu Aug 15 10:37:49 2019 -0400
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import argparse
+import psycopg2
+import sys
+
+from sqlalchemy import create_engine
+from sqlalchemy import MetaData
+from sqlalchemy.engine.url import make_url
+
+# Module-level SQLAlchemy MetaData container, created without an engine.
+metadata = MetaData()
+
+# The fixed VCF header column names.  The id file handed to this script holds
+# one header field per line, so these entries are not Affymetrix ids.
+SKIP_VALS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
+
+
+class EnsureSynced(object):
+    """Check that the Affymetrix ids in a file match those in the database.
+
+    The ids read from the file named by --affy_ids_from_file are compared with
+    the affy_id values selected from the sample table, and a report is written
+    to the file named by --output.
+    """
+
+    def __init__(self):
+        """Parse the command line, open the output file and connect to the database."""
+        self.args = None
+        self.conn = None
+        self.parse_args()
+        self.outfh = open(self.args.output, "w")
+        self.connect_db()
+        self.engine = create_engine(self.args.database_connection_string)
+        self.metadata = MetaData(self.engine)
+        self.affy_ids_from_db = []
+        self.affy_ids_from_file = []
+
+    def connect_db(self):
+        """Open a psycopg2 connection described by the connection string.
+
+        Raises AssertionError when the connection string is not a PostgreSQL url.
+        """
+        url = make_url(self.args.database_connection_string)
+        # psycopg2 expects the keyword 'user' where the url object says 'username'.
+        args = url.translate_connect_args(username='user')
+        args.update(url.query)
+        assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
+        self.conn = psycopg2.connect(**args)
+
+    def get_affy_ids_from_db(self):
+        """Append to self.affy_ids_from_db the affy_id of each sample whose genotype is not 'failed'."""
+        cmd = "SELECT affy_id FROM sample WHERE genotype_id NOT IN (SELECT id FROM genotype WHERE coral_mlg_clonal_id = 'failed') ORDER BY affy_id;"
+        cur = self.conn.cursor()
+        try:
+            cur.execute(cmd)
+            self.affy_ids_from_db.extend(row[0] for row in cur.fetchall())
+        finally:
+            # Release the cursor even when the query fails.
+            cur.close()
+
+    def get_affy_ids_from_file(self, f):
+        """Read one id per line from the file f into self.affy_ids_from_file, then sort it.
+
+        Blank lines and the header column names listed in SKIP_VALS are ignored.
+        """
+        with open(f) as fh:
+            for line in fh:
+                line = line.strip()
+                if not line or line in SKIP_VALS:
+                    # Not an Affymetrix id: an empty line or one of the
+                    # fixed header column names.
+                    continue
+                self.affy_ids_from_file.append(line)
+        self.affy_ids_from_file.sort()
+
+    def get_difference(self, list1, list2):
+        """Return the items found only in the longer of the two lists.
+
+        When list1 is not longer than list2 the result is the items of list2
+        that are absent from list1.  The result is one-directional and in no
+        particular order; run() reports both directions itself.
+        """
+        if len(list1) > len(list2):
+            return list(set(list1) - set(list2))
+        return list(set(list2) - set(list1))
+
+    def log(self, msg):
+        """Write msg followed by a newline to the output file."""
+        self.outfh.write("%s\n" % msg)
+
+    def _log_ids(self, heading, ids):
+        """Write heading followed by each id in ids; write nothing when ids is empty."""
+        if not ids:
+            return
+        self.log(heading)
+        # key=str keeps the order deterministic even if an id is None.
+        for affy_id in sorted(ids, key=str):
+            self.log("%s\n" % affy_id)
+
+    def parse_args(self):
+        """Parse the command line into self.args; all three options are required."""
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
+        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
+        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
+        self.args = parser.parse_args()
+
+    def run(self):
+        """Compare the database ids with the file ids and write the report.
+
+        Exits the process with status 1 when the two collections differ.
+        """
+        # Imported locally so the file's import block does not need to change.
+        from collections import Counter
+
+        self.get_affy_ids_from_db()
+        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
+        # Compare as multisets rather than position by position: the database
+        # rows are ordered by the database's collation while the file ids are
+        # ordered by Python, so the two orderings are not guaranteed to agree.
+        in_sync = Counter(self.affy_ids_from_db) == Counter(self.affy_ids_from_file)
+        if in_sync:
+            self.log("The selected file is in sync with the database.\n\n")
+        else:
+            self.log("The selected file is not in sync with the database.\n\n")
+        self.log("Number of Affymetrix ids in the database: %d\n" % len(self.affy_ids_from_db))
+        self.log("Number of Affymetrix ids in the file: %d\n" % len(self.affy_ids_from_file))
+        if in_sync:
+            return
+        db_ids = set(self.affy_ids_from_db)
+        file_ids = set(self.affy_ids_from_file)
+        # Report both directions; a mismatch can have ids missing on either side
+        # regardless of which collection is larger.
+        self._log_ids("The database contains the following Affymetrix ids that are not in the file.\n", db_ids - file_ids)
+        self._log_ids("The file contains the following Affymetrix ids that are not in the database.\n", file_ids - db_ids)
+        # Release the output file and the connection before exiting.
+        self.shutdown()
+        sys.exit(1)
+
+    def shutdown(self):
+        """Flush and close the output file if still open, then close the connection."""
+        if not self.outfh.closed:
+            self.outfh.flush()
+            self.outfh.close()
+        if self.conn is not None:
+            self.conn.close()
+
+
+if __name__ == '__main__':
+    # Build the checker, run the comparison, then release the output file
+    # and the database connection.
+    synchronizer = EnsureSynced()
+    synchronizer.run()
+    synchronizer.shutdown()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ensure_synced.xml	Thu Aug 15 10:37:49 2019 -0400
@@ -0,0 +1,40 @@
+<tool id="ensure_synced" name="Ensure synchronized" version="1.0.0">
+    <description>analysis components</description>
+    <command detect_errors="exit_code"><![CDATA[
+## Pull the header line out of the VCF, write one header field per line,
+## then compare those fields with the ids stored in the database.  Each
+## step is chained with && because the lines are joined into one command.
+#set affy_ids_from_file = 'affy_ids_from_file.txt'
+grep "#CHROM" '$input' > test.head &&
+tr '\t' '\n' < test.head > '$affy_ids_from_file' &&
+python '$__tool_directory__/ensure_synced.py'
+--database_connection_string '$__app__.config.corals_database_connection'
+--affy_ids_from_file '$affy_ids_from_file'
+--output '$output']]></command>
+    <inputs>
+        <param name="input" format="vcf" type="data" label="All genotyped samples file"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="txt" label="${tool.name} (process log) on ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+            <!--Testing this tool is a bit difficult at the current time.-->
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Compares the set of Affymetrix id strings contained within a selected VCF file with the set of Affymetrix ids contained
+in the affy_id column of the sample table in the corals (stag) database for all samples not in a failed state.  If these
+sets are equivalent, the file and the database are considered to be in sync with each other.
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {Baums I},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {http://baumslab.org}
+            }
+        </citation>
+    </citations>
+</tool>
+