Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
changeset 0:6282484a52bc draft
Uploaded
author | greg |
---|---|
date | Tue, 21 Aug 2018 12:57:00 -0400 |
parents | |
children | 794951c24c86 |
files | .shed.yml gene_family_scaffold_loader.py gene_family_scaffold_loader.xml macros.xml plant_tribes_scaffolds.loc plant_tribes_scaffolds.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 8 files changed, 499 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,13 @@ +name: plant_tribes_gene_family_scaffold_loader +owner: greg +description: | + Contains a tool that analyzes a PlantTribes scaffold and inserts information about it into the Galaxy PlantTribes database. +homepage_url: https://github.com/dePamphilis/PlantTribes +long_description: | + Contains a tool that analyzes a PlantTribes scaffold, installed into Galaxy via the PlantTribes Scaffolds Downloader + data manager tool, and inserts information about it into the Galaxy PlantTribes database for querying and additional + analysis. +remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/phylogenetics/plant_tribes/gene_family_scaffold_loader +type: unrestricted +categories: +- Phylogenetics
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_family_scaffold_loader.py Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,395 @@ +#!/usr/bin/env python +""" +add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes +database efficiently by bypassing the Galaxy model and operating directly on the database. +PostgreSQL 9.1 or greater is required. +""" +import argparse +import glob +import os +import sys + +import psycopg2 +from sqlalchemy.engine.url import make_url + + +class ScaffoldLoader(object): + def __init__(self): + self.args = None + self.clustering_methods = [] + self.conn = None + self.gene_sequences_dict = {} + self.scaffold_genes_dict = {} + self.scaffold_recs = [] + self.species_genes_dict = {} + self.species_ids_dict = {} + self.taxa_lineage_config = None + self.parse_args() + self.fh = open(self.args.output, "w") + self.connect_db() + + def parse_args(self): + parser = argparse.ArgumentParser() + parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), + parser.add_argument('--output', dest='output', help='Output dataset'), + parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') + self.args = parser.parse_args() + + def connect_db(self): + url = make_url(self.args.database_connection_string) + self.log('Connecting to database with URL: %s' % url) + args = url.translate_connect_args(username='user') + args.update(url.query) + assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' + self.conn = psycopg2.connect(**args) + + def flush(self): + self.conn.commit() + + def shutdown(self): + self.conn.close() + + def update(self, sql, args): + try: + cur = self.conn.cursor() + cur.execute(sql, args) + except Exception as e: + msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) + self.stop_err(msg) + return cur + + def stop_err(self, msg): + sys.stderr.write(msg) + self.fh.flush() + self.fh.close() + sys.exit(1) + + def log(self, msg): + self.fh.write("%s\n" % msg) + self.fh.flush() + + @property + def can_add_scaffold(self): + """ + Make sure the scaffold has not already been added. + """ + scaffold_id = os.path.basename(self.args.scaffold_path) + sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id + cur = self.conn.cursor() + cur.execute(sql) + try: + cur.fetchone()[0] + # The scaffold has been added to the database. + return False + except: + # The scaffold has not yet been added. + return True + + def run(self): + if self.can_add_scaffold: + self.process_annot_dir() + self.process_scaffold_config_files() + self.process_orthogroup_fasta_files() + self.fh.flush() + self.fh.close() + else: + self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path)) + + def process_annot_dir(self): + """ + 1. Parse all of the *.min_evalue.summary files in the + ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate + both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. + 1. Parse all of the *.list files in the same directory to populate + self.scaffold_genes_dict. + """ + scaffold_id = os.path.basename(self.args.scaffold_path) + file_dir = os.path.join(self.args.scaffold_path, 'annot') + # The scaffol naming convention must follow this pattern: + # <integer1>Gv<integer2>.<integer3> + # where integer 1 is the number of genomes in the scaffold_id. For example: + # 22Gv1.1 -> 22 genomes + # 12Gv1.0 -> 12 genomes + # 26Gv2.0 -> 26 genomes, etc. + num_genomes = int(scaffold_id.split("Gv")[0]) + super_ortho_start_index = num_genomes + 1 + for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")): + items = os.path.basename(file_name).split(".") + clustering_method = items[0] + # Save all clustering methods for later processing. + if clustering_method not in self.clustering_methods: + self.clustering_methods.append(clustering_method) + # Insert a row in to the plant_tribes_scaffold table. + self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method)) + args = [scaffold_id, clustering_method] + sql = """ + INSERT INTO plant_tribes_scaffold + VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) + RETURNING id; + """ + cur = self.update(sql, tuple(args)) + self.flush() + scaffold_id_db = cur.fetchone()[0] + self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) + with open(file_name, "r") as fh: + i = 0 + for i2, line in enumerate(fh): + if i2 == 0: + # Skip first line. + continue + num_genes = 0 + num_species = 0 + items = line.split("\t") + orthogroup_id = int(items[0]) + # Zero based items 1 to num_genomes consists of the + # number of species classified in the orthogroup (i.e., + # species with at least 1 gene in the orthogroup). + for j in range(1, num_genomes): + j_int = int(items[j]) + if j_int > 0: + # The species has at least 1 gene + num_species += 1 + num_genes += j_int + # Insert a row into the plant_tribes_orthogroup table. + args = [orthogroup_id, scaffold_id_db, num_species, num_genes] + for k in range(super_ortho_start_index, len(items)): + args.append('%s' % str(items[k])) + sql = """ + INSERT INTO plant_tribes_orthogroup + VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); + """ + cur = self.update(sql, tuple(args)) + self.flush() + i += 1 + self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) + for file_name in glob.glob(os.path.join(file_dir, "*list")): + items = os.path.basename(file_name).split(".") + clustering_method = items[0] + with open(file_name, "r") as fh: + for i, line in enumerate(fh): + items = line.split("\t") + # The key will be a combination of clustering_method and + # orthogroup_id separated by "^^" for easy splitting later. + key = "%s^^%s" % (clustering_method, items[0]) + # The value is the gen_id with all white space replaced by "_". + val = items[1].replace("|", "_") + if key in self.scaffold_genes_dict: + self.scaffold_genes_dict[key].append(val) + else: + self.scaffold_genes_dict[key] = [val] + + def process_scaffold_config_files(self): + """ + 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config + (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate. + 2. Calculate the number of genes found + for each species and add the number to self.species_genes_dict. + 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to + populate the plant_tribes_taxon table. + """ + scaffold_id = os.path.basename(self.args.scaffold_path) + file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) + self.log("Processing rooting order config: %s" % str(file_name)) + # Populate self.species_ids_dict. + with open(file_name, "r") as fh: + for i, line in enumerate(fh): + line = line.strip() + if len(line) == 0 or line.startswith("#") or line.startswith("["): + # Skip blank lines, comments and section headers. + continue + # Example line: + # Physcomitrella patens=Phypa + items = line.split("=") + self.species_ids_dict[items[1]] = items[0] + # Get lineage information for orthogrpoup taxa. + for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()): + # The format of key is <clustering_method>^^<orthogroup_id>. + # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001" + scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^") + clustering_method = scaffold_genes_dict_key_items[0] + # Get the list of genes for the current scaffold_genes_dict_key. + gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key] + for gene_id in gene_list: + # Example species_code: Musac1.0, where + # species_name is Musac and version is 1.0. + species_code = gene_id.split("_")[1] + # Strip the version from the species_code. + species_code = species_code[0:5] + # Get the species_name from self.species_ids_dict. + species_name = self.species_ids_dict[species_code] + # Create a key for self.species_genes_dict, with the format: + # <clustering_method>^^<species_name> + species_genes_dict_key = "%s^^%s" % (clustering_method, species_name) + # Add an entry to self.species_genes_dict, where the value + # is a list containing species_name and num_genes. + if species_genes_dict_key in self.species_genes_dict: + tup = self.species_genes_dict[species_genes_dict_key] + tup[1] += 1 + self.species_genes_dict[species_genes_dict_key] = tup + else: + self.species_genes_dict[species_genes_dict_key] = [species_name, 1] + # Populate the plant_tribes_taxon table. + file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) + self.log("Processing taxa lineage config: %s" % str(file_name)) + with open(file_name, "r") as fh: + for line in fh: + line = line.strip() + if len(line) == 0 or line.startswith("#") or line.startswith("Species"): + # Skip blank lines, comments and section headers. + continue + # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots + items = line.split("\t") + species_name = items[0] + i = 0 + for clustering_method in self.clustering_methods: + # The format of species_genes_dict_key is <clustering_method>^^<species_name>. + species_genes_dict_key = "%s^^%s" % (clustering_method, species_name) + # Get the scaffold_rec for the current scaffold_id and clustering_method. + # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] + for scaffold_rec in self.scaffold_recs: + if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: + scaffold_id_db = scaffold_rec[0] + # The value is a list containing species_name and num_genes. + val = self.species_genes_dict[species_genes_dict_key] + if species_name == val[0]: + num_genes = val[1] + else: + num_genes = 0 + # Insert a row in to the plant_tribes_scaffold table. + args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] + sql = """ + INSERT INTO plant_tribes_taxon + VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); + """ + self.update(sql, tuple(args)) + self.flush() + i += 1 + self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name))) + + def process_orthogroup_fasta_files(self): + """ + 1. Analyze all of the scaffold .fna and .faa files for each clustering + method to populate the aa_dict and dna_dict sequence dictionaries. + 2. Use the populated sequence dictionaries to populate the plant_tribes_gene + and gene_scaffold_orthogroup_taxon_association tables. + """ + scaffold_id = os.path.basename(self.args.scaffold_path) + aa_dict = {} + dna_dict = {} + # Populate aa_dict and dna_dict. + for clustering_method in self.clustering_methods: + file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method) + for file_name in os.listdir(file_dir): + items = file_name.split(".") + orthogroup_id = items[0] + file_extension = items[1] + if file_extension == "fna": + adict = dna_dict + else: + adict = aa_dict + file_path = os.path.join(file_dir, file_name) + with open(file_path, "r") as fh: + for i, line in enumerate(fh): + line = line.strip() + if len(line) == 0: + # Skip blank lines (shoudn't happen). + continue + if line.startswith(">"): + # Example line: + # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110 + gene_id = line.lstrip(">") + # The dictionary keys will combine the orthogroup_id, + # clustering method and gene id using the format + # ,orthogroup_id>^^<clustering_method>^^<gene_id>. + combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id) + if combined_id not in adict: + # The value will be the dna sequence string.. + adict[combined_id] = "" + else: + # Example line: + # ATGGAGAAGGACTTT + # Here combined_id is set because the fasta format specifies + # that all lines following the gene id defined in the if block + # above will be the sequence associated with that gene until + # the next gene id line is encountered. + sequence = adict[combined_id] + sequence = "%s%s" % (sequence, line) + adict[combined_id] = sequence + # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables + # from the contents of aa_dict and dna_dict. + self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.") + gi = 0 + for gsoai, combined_id in enumerate(sorted(dna_dict.keys())): + # The dictionary keys combine the orthogroup_id, clustering method and + # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. + items = combined_id.split("^^") + orthogroup_id = items[0] + clustering_method = items[1] + gene_id = items[2] + # The value will be a list containing both + # clustering_method and the dna string. + dna_sequence = dna_dict[combined_id] + aa_sequence = aa_dict[combined_id] + # Get the species_code from the gene_id. + species_code = gene_id.split("_")[1] + # Strip the version from the species_code. + species_code = species_code[0:5] + # Get the species_name from self.species_ids_dict. + species_name = self.species_ids_dict[species_code] + # Get the plant_tribes_orthogroup primary key id for + # the orthogroup_id from the plant_tribes_orthogroup table. + sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id + cur = self.conn.cursor() + cur.execute(sql) + orthogroup_id_db = cur.fetchone()[0] + # If the plant_tribes_gene table contains a row that has the gene_id, + # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table. + # Get the taxon_id for the species_name from the plant_tribes_taxon table. + sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name + cur = self.conn.cursor() + cur.execute(sql) + taxon_id_db = cur.fetchone()[0] + # If the plant_tribes_gene table contains a row that has the gene_id, + # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table. + sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id + cur = self.conn.cursor() + cur.execute(sql) + try: + gene_id_db = cur.fetchone()[0] + except: + # Insert a row into the plant_tribes_gene table. + args = [gene_id, dna_sequence, aa_sequence] + sql = """ + INSERT INTO plant_tribes_gene + VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s) + RETURNING id; + """ + cur = self.update(sql, tuple(args)) + self.flush() + gene_id_db = cur.fetchone()[0] + gi += 1 + if gi % 1000 == 0: + self.log("Inserted 1000 more rows into the plant_tribes_gene table.") + # Insert a row into the gene_scaffold_orthogroup_taxon_association table. + # Get the scaffold_rec for the current scaffold_id and clustering_method. + # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] + for scaffold_rec in self.scaffold_recs: + if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: + scaffold_id_db = scaffold_rec[0] + args = [gene_id_db, scaffold_id_db, orthogroup_id_db, taxon_id_db] + sql = """ + INSERT INTO gene_scaffold_orthogroup_taxon_association + VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s); + """ + cur = self.update(sql, tuple(args)) + self.flush() + if gsoai % 1000 == 0: + self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.") + self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi) + self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai) + + +if __name__ == '__main__': + scaffold_loader = ScaffoldLoader() + scaffold_loader.run() + scaffold_loader.shutdown()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_family_scaffold_loader.xml Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,45 @@ +<tool id="plant_tribes_gene_family_scaffold_loader" name="Load PlantTribes scaffold" version="@WRAPPER_VERSION@.0.0"> + <description>into Galaxy PlantTribes database</description> + <macros> + <import>macros.xml</import> + </macros> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/gene_family_scaffold_loader.py' +--database_connection_string '$__app__.config.plant_tribes_database_connection' +--output '$output' +--scaffold_path '$GALAXY_DATA_INDEX_DIR/plant_tribes/scaffolds/$scaffold' +--output '$output']]></command> + <inputs> + <expand macro="param_scaffold" /> + </inputs> + <outputs> + <data name="output" format="txt"/> + </outputs> + <tests> + <test> + <!--Testing this tool is a bit difficult at the current time.--> + </test> + </tests> + <help> +This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary +analyses of genome-scale gene families and transcriptomes. This tool analyzes scaffolds installed into Galaxy by the +PlantTribes Scaffolds Downloader data manager tool and inserts information about them into the Galaxy PlantTribes database +for querying and additional analysis. + +----- + +**Required options** + + * **Gene family scaffold** - one of the PlantTribes gene family scaffolds, installed into Galaxy by the PlantTribes Scaffold Data Manager tool, that has not yet been analyzed and loaded into the Galaxy PlantTribes database. + </help> + <citations> + <citation type="bibtex"> + @unpublished{None, + author = {Greg Von Kuster,Eric Wafula}, + title = {None}, + year = {None}, + eprint = {None}, + url = {https://github.com/dePamphilis/PlantTribes}} + </citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,27 @@ +<?xml version='1.0' encoding='UTF-8'?> +<macros> + <token name="@WRAPPER_VERSION@">1.0</token> + <xml name="param_method"> + <param name="method" type="select" label="Protein clustering method"> + <option value="gfam" selected="true">GFam</option> + <option value="orthofinder">OrthoFinder</option> + <option value="orthomcl">OrthoMCL</option> + </param> + </xml> + <xml name="param_scaffold"> + <param name="scaffold" type="select" label="Gene family scaffold"> + <options from_data_table="plant_tribes_scaffolds" /> + <validator type="no_options" message="No PlantTribes scaffolds are available. Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table." /> + </param> + </xml> + <xml name="citation1"> + <citation type="bibtex"> + @misc{None, + journal = {None}, + author = {1. Wafula EK}, + title = {Manuscript in preparation}, + year = {None}, + url = {https://github.com/dePamphilis/PlantTribes},} + </citation> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plant_tribes_scaffolds.loc Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,3 @@ +## Plant Tribes scaffolds +#Value Name Path Description +22Gv1.1 22Gv1.1 ${__HERE__}/test-data/tool-data/plant_tribes/scaffolds/22Gv1.1 22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plant_tribes_scaffolds.loc.sample Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,4 @@ +## Plant Tribes scaffolds +#Value Name Path Description +#22Gv1.0 22Gv1.0 /plant_tribes/scaffolds/22Gv1.0 22 plant genomes (Angiosperms clusters, version 1.0; 22Gv1.0) +#22Gv1.1 22Gv1.1 /plant_tribes/scaffolds/22Gv1.1 22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,6 @@ +<tables> + <table name="plant_tribes_scaffolds" comment_char="#"> + <columns>value, name, path, description</columns> + <file path="tool-data/plant_tribes_scaffolds.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Aug 21 12:57:00 2018 -0400 @@ -0,0 +1,6 @@ +<tables> + <table name="plant_tribes_scaffolds" comment_char="#"> + <columns>value, name, path, description</columns> + <file path="${__HERE__}/plant_tribes_scaffolds.loc" /> + </table> +</tables>