# HG changeset patch # User greg # Date 1496940503 14400 # Node ID f8603464bea77a520fde511bc350e4dac3ccb011 Uploaded diff -r 000000000000 -r f8603464bea7 .shed.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,15 @@ +name: plant_tribes_gene_family_classifier +owner: greg +description: | + Contains a tool that classifies gene sequences into precomputed orthologous gene family clusters using either + blastp (faster), HMMScan (slower but more sensitive to remote homologs) or both (more exhaustive). +homepage_url: https://github.com/dePamphilis/PlantTribes +long_description: | + Contains a tool that is one of the PlantTribes collection of automated modular analysis pipelines that + utilize objective classifications of complete protein sequences from sequenced plant genomes to perform + comparative evolutionary studies. This tool classifies gene sequences into precomputed orthologous gene family + clusters using either blastp (faster), HMMScan (slower but more sensitive to remote homologs) or both (more exhaustive). 
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/plant_tribes/gene_family_classifier +type: unrestricted +categories: +- Phylogenetics diff -r 000000000000 -r f8603464bea7 gene_family_classifier.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_family_classifier.py Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,95 @@ +#!/usr/bin/env python +import argparse +import os +import shutil + +import utils + + +BUFF_SIZE = 1048576 +OUTPUT_DIR = 'geneFamilyClassification_dir' + +parser = argparse.ArgumentParser() +parser.add_argument('--input', dest='input', help='Input dataset') +parser.add_argument('--scaffold', dest='scaffold', help='Orthogroups or gene families proteins scaffold') +parser.add_argument('--method', dest='method', help='Protein clustering method') +parser.add_argument('--classifier', dest='classifier', help='Protein classification method') +parser.add_argument('--config_dir', dest='config_dir', help='Directory containing default configuration files') +parser.add_argument('--num_threads', dest='num_threads', type=int, help='Number of threads to use for execution') +parser.add_argument('--super_orthogroups', dest='super_orthogroups', default=None, help='Super orthogroups clustering specification') +parser.add_argument('--single_copy_custom', dest='single_copy_custom', default=None, help='Custom single copy orthogroup configuration') +parser.add_argument('--single_copy_taxa', dest='single_copy_taxa', type=int, default=0, help='Minimum single copy taxa required in orthogroup') +parser.add_argument('--taxa_present', dest='taxa_present', type=int, default=0, help='Minimum taxa required in single copy orthogroup') +parser.add_argument('--orthogroup_fasta', dest='orthogroup_fasta', default=None, help='Flag to create orthogroup sequences') +parser.add_argument('--coding_sequences', dest='coding_sequences', default=None, help='Flag to create orthogroup coding sequences') +parser.add_argument('--save_hmmscan_log', 
dest='save_hmmscan_log', default=None, help='Flag to save the hmmscan log') +parser.add_argument('--hmmscan_log', dest='hmmscan_log', default=None, help='hmmscan log file') +parser.add_argument('--output_ptortho', dest='output_ptortho', default=None, help='Output for orthogroups') +parser.add_argument('--output_ptortho_dir', dest='output_ptortho_dir', default=None, help='output_ptortho.files_path') +parser.add_argument('--output_ptorthocs', dest='output_ptorthocs', default=None, help='Output for orthogroups with corresponding coding sequences') +parser.add_argument('--output_ptorthocs_dir', dest='output_ptorthocs_dir', default=None, help='output_ptorthocs.files_path') +parser.add_argument('--output_ptsco', dest='output_ptsco', default=None, help='Output for single copy orthogroups') +parser.add_argument('--output_ptsco_dir', dest='output_ptsco_dir', default=None, help='output_ptsco.files_path') + +args = parser.parse_args() + +# Build the command line. +cmd = 'GeneFamilyClassifier' +cmd += ' --proteins %s' % args.input +cmd += ' --scaffold %s' % args.scaffold +cmd += ' --method %s' % args.method +cmd += ' --classifier %s' % args.classifier +cmd += ' --config_dir %s' % args.config_dir +cmd += ' --num_threads %d' % args.num_threads +if args.super_orthogroups is not None: + cmd += ' --super_orthogroups %s' % args.super_orthogroups +if args.single_copy_custom is not None: + cmd += ' --single_copy_custom %s' % args.single_copy_custom +if args.single_copy_taxa > 0: + cmd += ' --single_copy_taxa %d' % args.single_copy_taxa +if args.taxa_present > 0: + cmd += ' --taxa_present %d' % args.taxa_present +if args.orthogroup_fasta is None: + create_ortho_sequences = False +else: + create_ortho_sequences = True + cmd += ' --orthogroup_fasta' +if args.coding_sequences is None: + create_corresponding_coding_sequences = False +else: + create_corresponding_coding_sequences = True + cmd += ' --coding_sequences %s' % args.coding_sequences + +# Run the command. 
+utils.run_command(cmd) + +# Handle hmmscan.log output. +if args.classifier in ['hmmscan', 'both']: + src_hmmscan_log = os.path.join(OUTPUT_DIR, 'hmmscan.log') + if os.path.exists(src_hmmscan_log): + if args.save_hmmscan_log is None: + os.remove(src_hmmscan_log) + else: + shutil.move(src_hmmscan_log, args.hmmscan_log) + +# Handle orthogroups outputs. +if create_ortho_sequences: + if create_corresponding_coding_sequences: + out_file = args.output_ptorthocs + orthogroups_fasta_dest_dir = args.output_ptorthocs_dir + title = 'Orthogroups and corresponding coding sequences files' + else: + out_file = args.output_ptortho + orthogroups_fasta_dest_dir = args.output_ptortho_dir + title = 'Orthogroups files' + orthogroups_fasta_src_dir = os.path.join(OUTPUT_DIR, 'orthogroups_fasta') + utils.move_directory_files(orthogroups_fasta_src_dir, orthogroups_fasta_dest_dir) + utils.write_html_output(out_file, title, orthogroups_fasta_dest_dir) + +# Handle single copy orthogroup outputs. +if args.output_ptsco is not None: + single_copy_fasta_src_dir = os.path.join(OUTPUT_DIR, 'single_copy_fasta') + single_copy_fasta_dest_dir = args.output_ptsco_dir + title = 'Single copy orthogroups files' + utils.move_directory_files(single_copy_fasta_src_dir, single_copy_fasta_dest_dir) + utils.write_html_output(args.output_ptsco, title, single_copy_fasta_dest_dir) diff -r 000000000000 -r f8603464bea7 gene_family_classifier.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_family_classifier.xml Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,366 @@ + + classifies gene sequences into pre-computed orthologous gene family clusters + + macros.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + save_hmmscan_log_cond['classifier'] in ['hmmscan', 'both'] and save_hmmscan_log_cond['save_hmmscan_log'] == 'yes' + + + 
options_type['options_type_selector'] == 'advanced' and options_type['create_orthogroup_cond']['create_orthogroup'] == 'yes' and options_type['create_orthogroup_cond']['create_corresponding_coding_sequences_cond']['create_corresponding_coding_sequences'] == 'no' + + + options_type['options_type_selector'] == 'advanced' and options_type['create_orthogroup_cond']['create_orthogroup'] == 'yes' and options_type['create_orthogroup_cond']['create_corresponding_coding_sequences_cond']['create_corresponding_coding_sequences'] == 'yes' + + + options_type['options_type_selector'] == 'advanced' and options_type['create_orthogroup_cond']['create_orthogroup'] == 'yes' and options_type['specify_single_copy_cond']['specify_single_copy'] == 'yes' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary +analyses of genome-scale gene families and transcriptomes. This tool classifies gene coding sequences either produced by +the AssemblyPostProcessor tool or from an external source into pre-computed orthologous gene family clusters (orthogroups) +of a PlantTribes scaffold. Classified sequences are then assigned with the corresponding orthogroups’ metadata that includes +gene counts of backbone taxa, super clusters (super orthogoups) at multiple stringencies, and functional annotations from +sources such as Gene Ontology (GO), InterPro protein domains, TAIR, UniProtKB/TrEMBL, and UniProtKB/Swiss-Prot. Additionally, +sequences belonging to single/low-copy gene families that are mainly utilized in species tree inference can be determined. + +----- + +**Required options** + + * **Proteins fasta file** - proteins fasta file either produced by the AssemblyPostProcessor tool or an external source selected from your history. 
+ * **Gene family scaffold** - one of the PlantTribes gene family scaffolds [2-4] installed into Galaxy by the PlantTribes Scaffold Data Manager tool. + * **Protein clustering method** - gene family scaffold protein clustering method as described in the AssemblyPostProcessor tool. + * **Protein classifier** - classifier to assign protein sequences into the orthogroups of a specified scaffold. PlantTribes implements three classification approaches: blastp (faster)[5], hmmscan (slower but more sensitive assignment of divergent homologs)[6], and both blastp and hmmscan (disagreements resolved in favor of hmmscan; more exhaustive). + +**Other options** + + * **Super orthogroups configuration** - select ‘Yes’ to enable super orthogroups configuration options. Super orthogroups[7] are constructed through a second iteration of MCL clustering to connect distant, but potentially related orthogroup clusters. + + * **Clustering distance measure** - distance measure used in merging orthogroup clusters into super orthogroup clusters. PlantTribes pre-computed super orthogroups are based on the minimum and average blastp e-value between all pairs of scaffold orthogroups used as the input matrix for MCL clustering algorithm[8]. + + * **Single copy orthogroups configuration** - select ‘Yes’ to enable single/low-copy orthogroups selection configuration options. + + * **Selection criterion** - single/low-copy orthogroups selection criterion. PlantTribes provides custom and global selection criteria for selecting user-defined single/low-copy scaffold orthogroups. + + * **Global selection configuration** - the upper limit values of the following two parameters vary depending on the selected gene family scaffold, and the tool will produce an error if the value exceeds the number of species in the circumscribed scaffold. + + * **Minimum single copy taxa** - minimum number of taxa with single copy genes in the orthogroup. 
+ * **Minimum taxa present** - minimum number of taxa present in the orthogroup. + + * **Custom selection configuration** - select ‘Yes’ to enable selection of a single copy configuration file. Scaffold configuration templates (.singleCopy.config) of how to customize single/low-copy orthogroups selection can be found in the scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool, and are also available at the PlantTribes GitHub `repository`_. Single/low-copy settings shown in these templates are used as defaults if ‘No’ is selected. + + * **Custom selection file** - select a single/low-copy customized configuration file from your history. + + * **Orthogroups fasta configuration** - select ‘Yes’ to create proteins orthogroups fasta files for the classified sequences. + + * **Orthogroups coding sequences** - select ‘Yes’ to create corresponding coding sequences orthogroup fasta files for the classified protein sequences. Requires coding sequences fasta file corresponding to the proteins fasta file to be selected from your history. + + * **Coding sequences fasta file** - select coding sequences fasta file corresponding to the proteins fasta file from your history. + +.. _repository: https://github.com/dePamphilis/PlantTribes/tree/master/config + + + + + + @article{Sasidharan2012, + journal = {Nucleic Acids Research}, + author = {2. Sasidharan R, Nepusz T, Swarbreck D, Huala E, Paccanaro A}, + title = {GFam: a platform for automatic annotation of gene families}, + year = {2012}, + pages = {gks631},} + + + @article{Li2003, + journal = {Genome Research}, + author = {3. Li L, Stoeckert CJ, Roos DS}, + title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes}, + year = {2003}, + volume = {13}, + number = {9}, + pages = {2178-2189},} + + + @article{Emms2015, + journal = {Genome Biology}, + author = {4. 
Emms DM, Kelly S}, + title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy}, + year = {2015}, + volume = {16}, + number = {1}, + pages = {157},} + + + @article{Altschul1990, + journal = {Journal of molecular biology} + author = {5. Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ}, + title = {Basic local alignment search tool}, + year = {1990}, + volume = {215}, + number = {3}, + pages = {403-410},} + + + @article{Eddy2009, + journal = {Genome Inform}, + author = {6. Eddy SR}, + title = {A new generation of homology search tools based on probabilistic inference}, + year = {2009}, + volume = {23}, + number = {1}, + pages = {205-211},} + + + @article{Wall2008, + journal = {Nucleic Acids Research}, + author = {7. Wall PK, Leebens-Mack J, Muller KF, Field D, Altman NS}, + title = {PlantTribes: a gene and gene family resource for comparative genomics in plants}, + year = {2008}, + volume = {36}, + number = {suppl 1}, + pages = {D970-D976},} + + + @article{Enright2002, + journal = {Nucleic acids research}, + author = {8. Enright AJ, Van Dongen S, Ouzounis CA}, + title = {n efficient algorithm for large-scale detection of protein families}, + year = {2002}, + volume = {30}, + number = {7}, + pages = {1575-1584},} + + + diff -r 000000000000 -r f8603464bea7 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,130 @@ + + + 1.0 + + + plant_tribes_assembly_post_processor + + + + + plant_tribes_gene_family_aligner + + + + + plant_tribes_gene_family_classifier + + + + + plant_tribes_gene_family_integrator + + + + + plant_tribes_kaks_analysis + + + + + r-optparse + + + + + plant_tribes_gene_family_phylogeny_builder + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @misc{None, + journal = {None}, + author = {1. 
Wafula EK}, + title = {Manuscript in preparation}, + year = {None}, + url = {https://github.com/dePamphilis/PlantTribes},} + + + diff -r 000000000000 -r f8603464bea7 plant_tribes_scaffolds.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plant_tribes_scaffolds.loc Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,3 @@ +## Plant Tribes scaffolds +#Value Name Path Description +22Gv1.1 22Gv1.1 ${__HERE__}/test-data/tool-data/plant_tribes/scaffolds/22Gv1.1 22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1) diff -r 000000000000 -r f8603464bea7 plant_tribes_scaffolds.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plant_tribes_scaffolds.loc.sample Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,4 @@ +## Plant Tribes scaffolds +#Value Name Path Description +#22Gv1.0 22Gv1.0 /plant_tribes/scaffolds/22Gv1.0 22 plant genomes (Angiosperms clusters, version 1.0; 22Gv1.0) +#22Gv1.1 22Gv1.1 /plant_tribes/scaffolds/22Gv1.1 22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1) diff -r 000000000000 -r f8603464bea7 test-data/output.ptorthocs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.ptorthocs Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,28 @@ +

Orthogroups and corresponding coding sequences files: 12 items

+

+Datasets + + + + + + + + + + + + +
20.faa +
20.fna +
3494.faa +
3494.fna +
3722.faa +
3722.fna +
38889.faa +
38889.fna +
39614.faa +
39614.fna +
5235.faa +
5235.fna +
diff -r 000000000000 -r f8603464bea7 test-data/proteins.blastp.22Gv1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.blastp.22Gv1.1 Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,50 @@ +contig_1 gnl|Soltu3.4|PGSC0003DMP400044471 92.17 115 9 0 2 116 15 129 1e-73 231 +contig_1 gnl|Mimgu1.0|PACid:17670850 93.04 115 8 0 2 116 307 421 3e-72 234 +contig_1 gnl|Soltu3.4|PGSC0003DMP400044470 92.17 115 9 0 2 116 313 427 1e-71 233 +contig_1 gnl|Soltu3.4|PGSC0003DMP400044472 92.17 115 9 0 2 116 313 427 1e-71 233 +contig_1 gnl|Solly2.3|Solyc04g083010.2.1 92.17 115 9 0 2 116 313 427 1e-71 232 +contig_1 gnl|Nelnu1.0|NNU_016098-RA 90.43 115 11 0 2 116 315 429 5e-71 230 +contig_1 gnl|Poptr2.2|PACid:18246737 91.30 115 10 0 2 116 301 415 6e-71 230 +contig_1 gnl|Glyma1.01|PACid:16244092 90.43 115 11 0 2 116 298 412 2e-70 229 +contig_1 gnl|Glyma1.01|PACid:16244091 90.43 115 11 0 2 116 298 412 2e-70 229 +contig_1 gnl|Carpa1.181|PACid:16417175 90.43 115 11 0 2 116 289 403 2e-70 229 +contig_1 gnl|Vitvi12X|PACid:17826505 90.43 115 11 0 2 116 229 343 2e-70 228 +contig_1 gnl|Theca1.0|Tc10_g016990 90.43 115 11 0 2 116 288 402 7e-70 227 +contig_1 gnl|Phoda3.0|PDK_30s1127391g001 88.70 115 13 0 2 116 146 260 1e-69 224 +contig_1 gnl|Glyma1.01|PACid:16251026 87.83 115 14 0 2 116 127 241 1e-69 223 +contig_1 gnl|Orysa6.0|PACid:16860403 86.96 115 15 0 2 116 325 439 6e-69 221 +contig_1 gnl|Thepa2.0|Tp5g34670 89.57 115 12 0 2 116 310 424 1e-68 224 +contig_1 gnl|Aquco1.0|PACid:18145344 88.70 115 13 0 2 116 320 434 1e-68 224 +contig_1 gnl|Bradi1.2|Bradi2g30567.2 88.70 115 13 0 2 116 300 414 1e-68 224 +contig_1 gnl|Arath10|AT1G79600.1 89.57 115 12 0 2 116 308 422 2e-68 224 +contig_1 gnl|Glyma1.01|PACid:16251025 87.83 115 14 0 2 116 361 475 6e-68 223 +contig_1 gnl|Medtr3.5|Medtr4g026450.1 86.09 115 16 0 2 116 313 427 6e-68 223 +contig_1 gnl|Glyma1.01|PACid:16245030 86.96 115 15 0 2 116 326 440 1e-67 222 +contig_1 gnl|Glyma1.01|PACid:16245029 86.96 115 15 0 2 116 326 440 1e-67 222 
+contig_1 gnl|Orysa6.0|PACid:16860404 86.96 115 15 0 2 116 325 439 2e-67 221 +contig_1 gnl|Sorbi1.4|PACid:1980340 86.09 115 16 0 2 116 312 426 2e-67 221 +contig_1 gnl|Ambtr1.0.27|AmTr_v1.0_scaffold00022.11 85.22 115 17 0 2 116 310 424 7e-67 219 +contig_1 gnl|Frave2.0|gene29299 89.57 115 12 0 2 116 711 825 2e-66 221 +contig_1 gnl|Musac1.0|GSMUA_Achr10T01800_001 82.61 115 20 0 2 116 229 343 4e-66 216 +contig_1 gnl|Musac1.0|GSMUA_Achr8T21380_001 81.74 115 21 0 2 116 229 343 6e-65 210 +contig_1 gnl|Phypa1.6|PACid:18072969 81.74 115 21 0 2 116 498 612 1e-62 209 +contig_1 gnl|Selmo1.0|PACid:15405864 78.07 114 25 0 3 116 228 341 4e-59 197 +contig_1 gnl|Poptr2.2|PACid:18214805 54.70 117 49 2 4 116 279 395 3e-36 135 +contig_1 gnl|Selmo1.0|PACid:15417058 56.60 106 46 0 11 116 216 321 3e-36 134 +contig_1 gnl|Glyma1.01|PACid:16255045 51.85 108 52 0 9 116 307 414 9e-36 134 +contig_1 gnl|Vitvi12X|PACid:17841082 51.79 112 54 0 5 116 337 448 1e-35 134 +contig_1 gnl|Aquco1.0|PACid:18159073 53.57 112 51 1 6 116 165 276 2e-35 131 +contig_1 gnl|Nelnu1.0|NNU_020249-RA 53.77 106 49 0 11 116 45 150 3e-35 130 +contig_1 gnl|Solly2.3|Solyc08g068920.2.1 53.77 106 49 0 11 116 323 428 5e-35 132 +contig_1 gnl|Medtr3.5|Medtr3g105760.1 51.85 108 52 0 9 116 309 416 5e-35 132 +contig_1 gnl|Phypa1.6|PACid:18069401 50.43 117 56 1 2 116 331 447 1e-34 131 +contig_1 gnl|Aquco1.0|PACid:18141086 53.91 115 45 1 2 116 17 123 3e-34 127 +contig_1 gnl|Arath10|AT5G24970.2 50.93 108 53 0 9 116 363 470 3e-34 130 +contig_1 gnl|Thepa2.0|Tp2g22500 51.85 108 52 0 9 116 326 433 3e-34 129 +contig_1 gnl|Mimgu1.0|PACid:17681633 52.83 106 50 0 11 116 191 296 3e-34 128 +contig_1 gnl|Ambtr1.0.27|AmTr_v1.0_scaffold00019.389 54.29 105 48 0 11 115 330 434 4e-34 129 +contig_1 gnl|Musac1.0|GSMUA_Achr1T23540_001 51.38 109 53 0 8 116 313 421 5e-34 129 +contig_1 gnl|Phypa1.6|PACid:18063964 53.04 115 46 1 2 116 312 418 6e-34 129 +contig_1 gnl|Phypa1.6|PACid:18051230 53.04 115 46 1 2 116 312 418 1e-33 128 +contig_1 
gnl|Bradi1.2|Bradi1g28540.1 47.75 111 58 0 6 116 299 409 2e-33 127 +contig_1 gnl|Theca1.0|Tc09_g001090 53.77 106 49 0 11 116 332 437 2e-33 127 diff -r 000000000000 -r f8603464bea7 test-data/proteins.blastp.22Gv1.1.bestOrthos --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.blastp.22Gv1.1.bestOrthos Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,7 @@ +Gene ID Orthogroup ID +contig_10 3494 +contig_7 3722 +contig_3 554 +contig_2 38889 +contig_1 5235 +contig_9 20 diff -r 000000000000 -r f8603464bea7 test-data/proteins.both.22Gv1.1.bestOrthos --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.both.22Gv1.1.bestOrthos Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,7 @@ +Gene ID Orthogroup ID +contig_1 5235 +contig_10 3494 +contig_2 38889 +contig_3 39614 +contig_7 3722 +contig_9 20 diff -r 000000000000 -r f8603464bea7 test-data/proteins.both.22Gv1.1.bestOrthos.summary --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.both.22Gv1.1.bestOrthos.summary Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,4 @@ +Gene ID Orthogroup ID Arabidopsis thaliana Thellungiella parvula Carica papaya Theobroma cacao Populus trichocarpa Fragaria vesca Glycine max Medicago truncatula Vitis vinifera Solanum lycopersicum Solanum tuberosum Mimulus guttatus Nelumbo nucifera Aquilegia coerulea Oryza sativa Brachypodium distachyon Sorghum bicolor Musa acuminata Phoenix dactylifera Amborella trichopoda Selaginella moellendorffii Physcomitrella patens SuperOthogroup I1.2 SuperOthogroup I1.5 SuperOthogroup I1.8 SuperOthogroup I2.0 SuperOthogroup I2.5 SuperOthogroup I3.0 SuperOthogroup I3.5 SuperOthogroup I4.0 SuperOthogroup I4.5 SuperOthogroup I5.0 AHRD Descriptions TAIR Gene(s) Descriptions Pfam Domains InterProScan Descriptions GO Molecular Functions GO Biological Processes GO Cellular Components +contig_1 5235 1 1 1 1 1 1 6 1 1 1 3 1 1 1 2 1 1 2 1 1 1 1 2 207 198 330 347 338 461 452 558 557 Protein kinase superfamily protein [1.000] Protein kinase superfamily 
protein ABC1 (PF03109) [0.968] | APH (PF01636) [0.161] UbiB domain (IPR004147) [0.968] | Aminoglycoside phosphotransferase (IPR002575) [0.161] NULL / Representative annotation below 0.1% NULL / Representative annotation below 0.1% NULL / Representative annotation below 0.1% +contig_10 3494 1 1 1 1 3 1 4 1 0 1 1 1 3 1 3 5 3 3 4 1 0 0 96 177 236 239 529 538 532 524 780 779 RING/U-box superfamily protein [1.000] RING/U-box superfamily protein zf-C3HC4_2 (PF13923) [0.949] | zf-RING_5 (PF14634) [0.026] Zinc finger, RING-type (IPR001841) [0.026] protein binding (GO:0005515) [0.026] | zinc ion binding (GO:0008270) [0.026] NULL / Representative annotation below 0.1% NULL / Representative annotation below 0.1% +contig_2 38889 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 489 741 746 752 773 800 798 1128 1403 1407 6-phosphogluconolactonase 5 [0.500] | 6-phosphogluconolactonase 2 [0.500] Unkown protein(s) / No TAIR description(s) Glucosamine_iso (PF01182) [1.000] Glucosamine/galactosamine-6-phosphate isomerase (IPR006148) [1.000] NULL / Representative annotation below 0.1% carbohydrate metabolic process (GO:0005975) [1.000] NULL / Representative annotation below 0.1% diff -r 000000000000 -r f8603464bea7 test-data/proteins.hmmscan.22Gv1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.hmmscan.22Gv1.1 Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,25 @@ +# --- full sequence ---- --- best 1 domain ---- --- domain number estimation ---- +# target name accession query name accession E-value score bias E-value score bias exp reg clu ov env dom rep inc description of target +#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ ----- --- --- --- --- --- --- --- --- --------------------- +5235 - contig_1 - 9.1e-80 270.0 0.0 9.9e-80 269.9 0.0 1.0 1 0 0 1 1 1 1 - +10465 - contig_1 - 5e-45 155.1 0.0 5.8e-45 154.9 0.0 1.0 1 0 0 1 1 1 1 - +9351 - contig_1 - 4.7e-41 142.1 0.0 5.2e-41 142.0 0.0 1.0 1 0 0 1 1 1 1 - +20322 - 
contig_1 - 2e-35 123.0 0.0 2.3e-35 122.8 0.0 1.0 1 0 0 1 1 1 1 - +7049 - contig_1 - 3.9e-29 102.4 0.0 4.4e-29 102.3 0.0 1.0 1 0 0 1 1 1 1 - +5553 - contig_1 - 8.8e-29 101.7 0.0 1e-28 101.4 0.0 1.0 1 0 0 1 1 1 1 - +2922 - contig_1 - 7.3e-28 97.8 0.0 8.5e-28 97.6 0.0 1.0 1 0 0 1 1 1 1 - +8140 - contig_1 - 5e-26 92.3 0.2 1.6e-24 87.3 0.2 2.0 1 1 0 1 1 1 1 - +4060 - contig_1 - 1.6e-22 80.8 0.0 1.9e-22 80.6 0.0 1.0 1 0 0 1 1 1 1 - +8330 - contig_1 - 3.7e-17 63.4 0.0 4.2e-17 63.2 0.0 1.0 1 0 0 1 1 1 1 - +3825 - contig_1 - 4.3e-13 50.4 0.0 7e-13 49.7 0.0 1.3 1 1 0 1 1 1 1 - +4099 - contig_1 - 9.7e-13 48.1 0.0 1.1e-12 47.8 0.0 1.0 1 0 0 1 1 1 1 - +10051 - contig_1 - 1.1e-10 42.0 0.0 1.4e-10 41.7 0.0 1.0 1 0 0 1 1 1 1 - +8737 - contig_1 - 1.3e-10 41.9 0.0 1.6e-10 41.7 0.0 1.0 1 0 0 1 1 1 1 - +25529 - contig_1 - 2.1e-08 34.3 0.0 2.6e-08 34.0 0.0 1.0 1 0 0 1 1 1 1 - +40021 - contig_1 - 6.1e-08 32.9 0.0 6.7e-08 32.8 0.0 1.0 1 0 0 1 1 1 1 - +3494 - contig_10 - 1e-06 28.1 0.1 1.1e-06 28.0 0.1 1.0 1 0 0 1 1 1 1 - +38889 - contig_2 - 1.1e-87 295.2 0.2 1.4e-87 294.9 0.2 1.0 1 0 0 1 1 1 1 - +3534 - contig_2 - 1.6e-72 245.8 0.1 1.8e-72 245.7 0.1 1.0 1 0 0 1 1 1 1 - +4875 - contig_2 - 1.1e-62 213.8 0.0 1.3e-62 213.6 0.0 1.0 1 0 0 1 1 1 1 - +37475 - contig_2 - 6.1e-49 168.3 0.0 7.1e-49 168.1 0.0 1.0 1 0 0 1 1 1 1 - +28488 - contig_2 - 1.1e-45 157.4 0.0 1.4e-45 157.0 0.0 1.0 1 0 0 1 1 1 1 - diff -r 000000000000 -r f8603464bea7 test-data/proteins.hmmscan.22Gv1.1.bestOrthos --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.hmmscan.22Gv1.1.bestOrthos Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,7 @@ +Gene ID Orthogroup ID +contig_9 20 +contig_1 5235 +contig_2 38889 +contig_10 3494 +contig_7 3722 +contig_3 39614 diff -r 000000000000 -r f8603464bea7 test-data/tool-data/plant_tribes/scaffolds/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tool-data/plant_tribes/scaffolds/README.txt Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,3 @@ +For functional tests 
to work, this directory must contain symlinks to the scaffolds data +installed into the Galaxy instance to which planemo points via the --galaxy_root parameter. +This would typically be something like ~/galaxy/tool-data/plant_tribes/scaffolds/22Gv1.1. diff -r 000000000000 -r f8603464bea7 test-data/transcripts.cleaned.nr.cds --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/transcripts.cleaned.nr.cds Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,43 @@ +>contig_1 +NTTAAAAAATTATATGGAGACAAGGAAGATATCCTTGTCCCAGATATTTTCTGGGATTACACGAGTGGAAAGGTGCTAAC +AATGGAGTGGGTTGAAGGTGTTAAATTAAATGAGCAAGATGCCGTTGAGAGTCAAGGGCTCAGTGTTCTGGATCTGGTGA +ATACCGGCATACAGTGCAGTCTTCGACAGCTGCTTGAGTACGGCTATTTTCATGCAGATCCTCACCCAGGGAATCTCTTA +GCTACACCTGACGGGAAGCTTGCTTTTCTTGATTTTGGAATGATGAGTGAGACTCCTGAACAAGCAAGATCGGCCATAAT +TGGTCATGTTGTACACATGGTTAATCGN +>contig_10 +ATGGCAGAAGAGAACACCACTACAATGAACCTCGATCTCAATTTGGGCCCCATCAATAACTCAAGCGACGATAGCGAACC +TTCATCACGCCCTTATACTGATGTCGCAATGAACTTGGAAGATTGGTTAGATAGTCCCGTCCGAGTTCGTGAAGTCGTCC +GCCACAGAAATCATAGGTGGCGCTCTTTGTGGCGCCAAATCCCAATTCCGCCTGATACGCGAAACCTCGCGCTCGAATTA +ATCGGCGGCAATGCCCCN +>contig_2 +NNCCTTTCCAAGGTACCCATCCCATCCAACAACATATATGCTATAAATGATAAGAAGTCCCCGGAAGATGCAGCAGACGA +CTACGAAAACCGCCTCAAGGAACTCGTCTCCGAAAAAATCATACCCGTTTCAACCATTAGCGGGTTCCCGAAGTTCGACC +TCATGTTGCTTGGAATGGGGCCCGACGGCCATGTGGCCTCTCTTTTCCCTTCTCACATGCAACGCTATGAGAAGGAGAAA +TGGGTCACTTTCATAACTGACTCTCCCAAACCGCCTCCGTCGAGAATCACTTTTACGTTTCCGATGATCAACTCGGCTTC +GGAGATCGCTATGGTGGTTACCGGGGCTGATTTGGCTGGTACAACTAAGATAGCATTGGGTACTACGGGCAATGTTAAGC +CGGGTGAGACTCCTTTGCCTTGTACTGAAGTTTCGGCTGAGGGAGAGGTTACTTGGTTCTTGGACAAAGATGCTGCTTCA +CAACTGTTAAATTATGTGCGCTTTGATGAT +>contig_3 +NNTGTGGATGAAGGAGTTGTTGTTGCTGGCTTGTCAGAGCAGGAGAAGGCATCTGTTTCTGAAATTCTGACAACTGCTAG +AGCTCATTCAGAAACAATTGAGAACCTTAAGAGAGATCATTCCCAGCAGGTATCCTGTATCGAACAGCACACGAATGATA +CTTTCAGGCAAAAATACATGGATTACGAGCCTACAGGGTCCACGCCAGTTAGGAGCGAGCCGGATATTCCCAGCAAAGGC +ACAATAGAGTCACTTCGTGCCATGCCTATAGATGCACTTGAAGAAGAATTTCGAGAAAACCATTCATACGAATCTGCTGT 
+TACAGGAAAGGAACTAATGCCGTCTGTTACGACTCGTGCACCATTTTCACAGATCAAC +>contig_5 +NNCGGTGGTCCGCCACAAACACACGTCAAGCGGGATCCCGCATCCCGCGGGCTCTTCCACGCGGTCGTCCCGGCGCTCGG +CCCTCTCCGTGACGCACGTCGAGAGGGACGATTTGGCCGATGCCGCGTGATGCCAGGCCCCGACATCATCAAGGAGCACT +CCGTTGTGCCGACCCTCGCCTTCGATGACGTCCTCGGGCGTCTAGCGAAGTATCGAAGAAGGGCGAGCGGAGCCATGAAT +CCTGGAGATGCGAGCCAGGTCACGAGAGGCGCGGCAGGCGAGTCTTTGCTCGCTCTCGCACCGTCTGCTCTCGTGCTTGG +AGAAGAAGGACCGACTGCTGACGTCGAACCGGATGATGGAAGGTTCGAACAAGTCAGATCTGGAAAGGGGCATGGAAAAG +GCGACACTCTCACN +>contig_7 +GAGAATGAGTGGTCTGGGGCTGAGTTTTTGAATGAAATGGCGGCAATGATGACTCAAAATAAATCCAATGAAAACGGAAC +CGGAACTTTTGAAGAACTGCAACAATTGTTCGATGAAATGTTTCAGAGCGACATCGAGTCCTTCAATGGTTGTTCTTCAT +CATCCAATGAAACATGTAGCAACTCGAACAAGAGGAATTCCATTGAGTCGAGCTCGGCTAATTTCAGACCCGAAAATGGA +AACGAAAGCGGCGAGATTAGCGGGAAGAAGAATACTAGGAAAGGTAAAGGTGACGNN +>contig_9 +NNACTTCGGTTAAAGGCAGATGAGGAGGCACAATGTTTGAATCAGATGCAGCGTATCATTTTTGATGAAATTATGGAGCA +TGTGGAGTTAGAAAAGGGGGGCTTCTATTTCGTATATCGCCCTGGGGGCAACGGAAAGACCTTGTGGTTGGCTATTATCT +CAAAACTGAGAAGCGAGGGTAGAATAGTTCTCGCAGTGGCTTCATCAGGTATAGCATTGCTTTTGGTTGAGGGTGGTAGA +ACAGCCCATTCTCGATTTAAAATACCCATAGATGTCAATGAATATAACAATTGTGAAATTAAACAGAACATCTACCTCGC +TGAACTTATATGTCACACCAATTTGGTCATTTGGGATGAGGCACCTATGACTCAATATTTTGTCTTTGAGGCGGTTGAN diff -r 000000000000 -r f8603464bea7 test-data/transcripts.cleaned.nr.pep --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/transcripts.cleaned.nr.pep Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,22 @@ +>contig_1 +XKKLYGDKEDILVPDIFWDYTSGKVLTMEWVEGVKLNEQDAVESQGLSVLDLVNTGIQCSLRQLLEYGYFHADPHPGNLL +ATPDGKLAFLDFGMMSETPEQARSAIIGHVVHMVNR +>contig_10 +MAEENTTTMNLDLNLGPINNSSDDSEPSSRPYTDVAMNLEDWLDSPVRVREVVRHRNHRWRSLWRQIPIPPDTRNLALEL +IGGNAP +>contig_2 +XLSKVPIPSNNIYAINDKKSPEDAADDYENRLKELVSEKIIPVSTISGFPKFDLMLLGMGPDGHVASLFPSHMQRYEKEK +WVTFITDSPKPPPSRITFTFPMINSASEIAMVVTGADLAGTTKIALGTTGNVKPGETPLPCTEVSAEGEVTWFLDKDAAS +QLLNYVRFDD +>contig_3 +XVDEGVVVAGLSEQEKASVSEILTTARAHSETIENLKRDHSQQVSCIEQHTNDTFRQKYMDYEPTGSTPVRSEPDIPSKG 
+TIESLRAMPIDALEEEFRENHSYESAVTGKELMPSVTTRAPFSQIN +>contig_5 +XGGPPQTHVKRDPASRGLFHAVVPALGPLRDARREGRFGRCRVMPGPDIIKEHSVVPTLAFDDVLGRLAKYRRRASGAMN +PGDASQVTRGAAGESLLALAPSALVLGEEGPTADVEPDDGRFEQVRSGKGHGKGDTLT +>contig_7 +ENEWSGAEFLNEMAAMMTQNKSNENGTGTFEELQQLFDEMFQSDIESFNGCSSSSNETCSNSNKRNSIESSSANFRPENG +NESGEISGKKNTRKGKGDX +>contig_9 +XLRLKADEEAQCLNQMQRIIFDEIMEHVELEKGGFYFVYRPGGNGKTLWLAIISKLRSEGRIVLAVASSGIALLLVEGGR +TAHSRFKIPIDVNEYNNCEIKQNIYLAELICHTNLVIWDEAPMTQYFVFEAVX diff -r 000000000000 -r f8603464bea7 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,6 @@ + + + value, name, path, description + +
+
diff -r 000000000000 -r f8603464bea7 tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,6 @@ + + + value, name, path, description + +
+
diff -r 000000000000 -r f8603464bea7 utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils.py Thu Jun 08 12:48:23 2017 -0400 @@ -0,0 +1,80 @@ +import os +import shutil +import subprocess +import sys + +FSTDERR = 'stderr.txt' +FSTDOUT = 'stdout.txt' + + +def check_execution_errors(rc, fstderr, fstdout): + if rc != 0: + fh = open(fstdout, 'rb') + out_msg = fh.read() + fh.close() + fh = open(fstderr, 'rb') + err_msg = fh.read() + fh.close() + msg = '%s\n%s\n' % (str(out_msg), str(err_msg)) + stop_err(msg) + + +def get_response_buffers(): + fstderr = os.path.join(os.getcwd(), FSTDERR) + fherr = open(fstderr, 'wb') + fstdout = os.path.join(os.getcwd(), FSTDOUT) + fhout = open(fstdout, 'wb') + return fstderr, fherr, fstdout, fhout + + +def move_directory_files(source_dir, destination_dir, copy=False): + source_directory = os.path.abspath(source_dir) + destination_directory = os.path.abspath(destination_dir) + if not os.path.isdir(destination_directory): + os.makedirs(destination_directory) + for dir_entry in os.listdir(source_directory): + source_entry = os.path.join(source_directory, dir_entry) + if copy: + shutil.copy(source_entry, destination_directory) + else: + shutil.move(source_entry, destination_directory) + + +def run_command(cmd): + fstderr, fherr, fstdout, fhout = get_response_buffers() + proc = subprocess.Popen(args=cmd, stderr=fherr, stdout=fhout, shell=True) + rc = proc.wait() + # Check results. + fherr.close() + fhout.close() + check_execution_errors(rc, fstderr, fstdout) + + +def stop_err(msg): + sys.exit(msg) + + +def write_html_output(output, title, dir): + with open(output, 'w') as fh: + dir_items = sorted(os.listdir(dir)) + # Directories can only contain either files or directories, + # but not both. + if len(dir_items) > 0: + item_path = os.path.join(dir, dir_items[0]) + if os.path.isdir(item_path): + header = 'Directories' + else: + header = 'Datasets' + else: + header = '' + fh.write('

%s: %d items

\n' % (title, len(dir_items))) + fh.write('

\n') + fh.write('%s\n' % header) + for index, fname in enumerate(dir_items): + if index % 2 == 0: + bgcolor = '#D8D8D8' + else: + bgcolor = '#FFFFFF' + link = '%s\n' % (fname, fname) + fh.write('\n' % (bgcolor, link)) + fh.write('
%s
\n')