# HG changeset patch # User rmarenco # Date 1471560600 14400 # Node ID 0ddb5ee32ff6ad340c548e066e5ce287cb1495a9 planemo upload for repository https://github.com/remimarenco/multi_fasta_glimmerhmm.git commit 28bd73b26b50165eded1d9ba995979acdf005ad1-dirty diff -r 000000000000 -r 0ddb5ee32ff6 glimmer_hmm.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmer_hmm.loc.sample Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,16 @@ +#This file lists the locations of all the trained_dir files +#under the "trained_dir" directory (a directory that contains a directory +#for each organism used by glimmer_hmm). +#This file has the format (white space characters are +#TAB characters): +# +# +# +#glimmer_hmm.loc could look something like this: +# +#human Human /path/to/trained_dir/human +#celegans Celegan /path/to/trained_dir/Celegans +#arabidopsis Arabidopsis /path/to/trained_dir/arabidopsis +#rice Rice /path/to/trained_dir/rice +#zebrafish Zebrafish /path/to/trained_dir/zebrafish +# diff -r 000000000000 -r 0ddb5ee32ff6 glimmerhmm.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glimmerhmm.xml Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,54 @@ + + Predict ORFs in eukaryotic genomes for Multi Fasta file + + + + + + + + + + + + + + + **What it does** + + GlimmerHMM is a new gene finder based on a Generalized Hidden Markov Model (GHMM). + Although the gene finder conforms to the overall mathematical framework of a GHMM, + additionally it incorporates splice site models adapted from the GeneSplicer program and a + decision tree adapted from GlimmerM. It also utilizes Interpolated Markov Models for the + coding and noncoding models . Currently, GlimmerHMM's GHMM structure includes introns of each phase, + intergenic regions, and four types of exons (initial, internal, final, and single). + A basic user manual can be consulted here. 
+ + **Example** + + Suppose you have the following DNA formatted sequences:: + + >SQ Sequence 8667507 BP; 1203558 A; 3121252 C; 3129638 G; 1213059 T; 0 other; + cccgcggagcgggtaccacatcgctgcgcgatgtgcgagcgaacacccgggctgcgcccg + ggtgttgcgctcccgctccgcgggagcgctggcgggacgctgcgcgtcccgctcaccaag + cccgcttcgcgggcttggtgacgctccgtccgctgcgcttccggagttgcggggcttcgc + cccgctaaccctgggcctcgcttcgctccgccttgggcctgcggcgggtccgctgcgctc + ccccgcctcaagggcccttccggctgcgcctccaggacccaaccgcttgcgcgggcctgg + + Running this tool will produce this:: + + ##gff-version 3 + ##sequence-region ConsensusfromCH236920mapping 1 4148552 + ConsensusfromCH236920mapping GlimmerHMM mRNA 1 122 . + . ID=ConsensusfromCH236920mapping.path1.gene1;Name=ConsensusfromCH236920mapping.path1.gene1 + ConsensusfromCH236920mapping GlimmerHMM CDS 1 122 . + 0 ID=ConsensusfromCH236920mapping.cds1.1; + ConsensusfromCH236920mapping GlimmerHMM mRNA 14066 15205 . - . ID=ConsensusfromCH236920mapping.path1.gene2;Name=ConsensusfromCH236920mapping.path1.gene2 + ConsensusfromCH236920mapping GlimmerHMM CDS 14066 15034 . - 0 ID=ConsensusfromCH236920mapping.cds2.1; + ConsensusfromCH236920mapping GlimmerHMM CDS 15137 15205 . - 0 ID=ConsensusfromCH236920mapping.cds2.2; + ConsensusfromCH236920mapping GlimmerHMM mRNA 19910 24210 . - . 
#!/usr/bin/python
# -*- coding: utf8 -*-
"""Run GlimmerHMM on each contig of a multi-FASTA file and merge the GFF3 results."""

import argparse
import os
import subprocess
import sys
import tempfile


def _run_glimmer(contig_file, trained_dir, keep_header):
    """Run ``glimmerhmm`` on a single-sequence FASTA file and return its GFF3 text.

    :param contig_file: path to a FASTA file holding exactly one sequence
    :param trained_dir: path to the GlimmerHMM training directory
    :param keep_header: when false, the first line of the output (the
        ``##gff-version`` comment) is dropped so that it appears only once in
        the merged file
    :return: the GFF3 text printed by glimmerhmm
    :raises RuntimeError: if glimmerhmm does not report "Done" on stderr
    """
    # universal_newlines=True makes both streams text on Python 2 and 3; with
    # bytes, the comparison against "Done" below can never succeed on Python 3
    process = subprocess.Popen(
        ["glimmerhmm", contig_file, trained_dir, "-g"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # communicate() already waits for the process to terminate
    output, errors = process.communicate()

    # glimmerhmm reports success on stderr with a message starting with "Done"
    tokens = errors.split()
    if not tokens or tokens[0] != "Done":
        raise RuntimeError("Error in glimmer: {0}".format(errors))
    sys.stdout.write(errors)

    if not keep_header:
        output = "\n".join(output.split("\n")[1:])
    return output


def _write_contigs(fasta, contig_path):
    """Copy each FASTA record of *fasta* in turn into the file *contig_path*.

    Yields *contig_path* once per record, right after the record has been
    completely written and the file closed. The file is overwritten when the
    next record starts, so the consumer must be done with it before resuming
    the iteration. Blank lines before the first record are ignored.
    """
    contig = None
    try:
        for line in fasta:
            if contig is not None and line.startswith('>'):
                # A new header closes the record currently being copied
                contig.close()
                contig = None
                yield contig_path
            if contig is None:
                if not line.strip():
                    continue
                contig = open(contig_path, 'w')
            contig.write(line)
        if contig is not None:
            # End of file: hand over the last record
            contig.close()
            contig = None
            yield contig_path
    finally:
        if contig is not None:
            contig.close()


def main(argv=None):
    """Predict genes on every contig of a multi-FASTA file with GlimmerHMM.

    :param argv: command line arguments (``--multi_fasta``, ``--trained_dir``,
        ``--output``); defaults to ``sys.argv[1:]``
    :raises RuntimeError: if glimmerhmm fails on one of the contigs

    The output file receives the concatenated GFF3 results; only the first
    contig keeps the leading ``##gff-version`` line. An input without any
    sequence produces an empty output file and glimmerhmm is not run.
    """
    parser = argparse.ArgumentParser(description='Get a multi-fasta, the trained_dir and the output file as inputs, '
                                                 'to generate GlimmerHMM gene prediction over all contigs')
    parser.add_argument('--multi_fasta', help='Multi fasta file to run GlimmerHMM on', required=True)
    parser.add_argument('--trained_dir', help='Path to the GlimmerHMM trained_dir', required=True)
    parser.add_argument('--output', help='file to output the result into', required=True)
    args = parser.parse_args(argv)

    trained_dir = args.trained_dir
    # TODO: Temporary fix for the issue with config.file in human/. Next: GC Content to select the appropriate folder
    # normpath() makes the check insensitive to a trailing slash
    if os.path.basename(os.path.normpath(trained_dir)) == "human":
        trained_dir = os.path.join(trained_dir, "Train0-43")

    # Private scratch file: no clash with a concurrent run, no stale content
    descriptor, temp_contig = tempfile.mkstemp(prefix="temp_contig_", suffix=".fa")
    os.close(descriptor)
    try:
        with open(args.output, 'w') as out, open(args.multi_fasta, 'r') as fasta:
            records = 0
            for records, contig_file in enumerate(_write_contigs(fasta, temp_contig), 1):
                out.write(_run_glimmer(contig_file, trained_dir, keep_header=(records == 1)))
            if not records:
                sys.stdout.write("No sequence found in {0}\n".format(args.multi_fasta))
    finally:
        os.remove(temp_contig)


if __name__ == "__main__":
    main()
#!/bin/sh
# Run GlimmerHMM on every contig of a multi-FASTA file and merge the GFF3
# results into a single output file.
#
# Usage: multi_glimmer.sh <reference_fasta> <trained_dir> <output>
set -e

if [ "$#" -ne 3 ]
then
    echo "Usage: $0 <reference_fasta> <trained_dir> <output>" >&2
    exit 1
fi

reference_fasta=$1
trained_dir=$2
output=$3
temp="temp_contig_file"
temp_gff="${temp}.gff"

# Never leave the scratch files behind, even when glimmerhmm fails
trap 'rm -f "${temp}" "${temp_gff}"' EXIT
# A file left over from an aborted run must not be mistaken for a contig
rm -f "${temp}" "${temp_gff}"

# Write the glimmerhmm output, with the comments
glimmerHMM_first () {
    glimmerhmm "$1" "${trained_dir}" -o "${output}" -g
}

# Append the glimmerhmm output without its first comment line
glimmerHMM_without_comments () {
    # Go through a file rather than a pipe: in a pipeline `set -e` only sees
    # the status of `tail`, so a glimmerhmm failure would go unnoticed
    glimmerhmm "$1" "${trained_dir}" -g > "${temp_gff}"
    tail -n +2 "${temp_gff}" >> "${output}"
}

# Number of contigs already handed to glimmerhmm
count=0

# Run glimmerhmm on the contig stored in ${temp}; only the first contig keeps
# the full comments, whether or not it is also the last one
process_contig () {
    if [ "${count}" -eq 0 ]
    then
        glimmerHMM_first "${temp}"
    else
        glimmerHMM_without_comments "${temp}"
    fi
    count=$((count + 1))
    rm -f "${temp}"
}

# Loop through the contigs to run glimmer on each.
# `IFS= read -r` keeps each line verbatim, and the `|| [ -n ... ]` part keeps
# a last line that has no trailing newline.
while IFS= read -r line || [ -n "${line}" ]
do
    case "${line}" in
        '>'*)
            # A new header: the previous contig, if any, is complete
            if [ -f "${temp}" ]
            then
                process_contig
            fi
            printf '%s\n' "${line}" > "${temp}"
            ;;
        '')
            # Keep blank lines inside a contig, ignore them before the first header
            if [ -f "${temp}" ]
            then
                printf '\n' >> "${temp}"
            fi
            ;;
        *)
            printf '%s\n' "${line}" >> "${temp}"
            ;;
    esac
done < "${reference_fasta}"

# Still last contig to process
if [ -f "${temp}" ]
then
    process_contig
fi

# No sequence at all: still produce an (empty) output file
if [ "${count}" -eq 0 ]
then
    : > "${output}"
fi
+Currently, GlimmerHMM's GHMM structure includes introns of each phase, intergenic regions, and four types of exons (initial, internal, final, and single). + +Majoros, W.H., Pertea, M., and Salzberg, S.L. TigrScan and GlimmerHMM: two open-source ab initio eukaryotic gene-finders Bioinformatics 20 2878-2879. +Pertea, M. and S. L. Salzberg (2002). "Computational gene finding in plants." Plant Molecular Biology 48(1-2): 39-48. +The Arabidopsis Genome Initiative, (2000) "Analysis of the genome sequence of the flowering plant Arabidopsis thaliana", Nature. Dec 14; 408(6814):796-815. +Pertea, M., S. L. Salzberg, et al. (2000). "Finding genes in Plasmodium falciparum." Nature 404(6773): 34; discussion 34-5. +Salzberg, S. L., M. Pertea, et al. (1999). "Interpolated Markov models for eukaryotic gene finding." Genomics 59(1): 24-31. + + +Installation +============ + +To install GlimmerHMM, please download it from + +ftp://ccb.jhu.edu/pub/software/glimmerhmm + +and follow the installation instructions. +To extract the glimmerHMM predicted genes, the GFF Parser from Brad Chapman (http://github.com/chapmanb/bcbb/tree/master/gff) was used and is included. + +To install the wrapper, copy the glimmerHMM folder into the Galaxy tools +folder and modify the $GALAXY_ROOT/config/tool_conf.xml file to make the tool available to Galaxy. +For example: + +```xml + + +``` + +You also need to use a trained organism by adding it as reference data in Galaxy: + +1. Add the *glimmer_hmm_trained_dir* data table to `tool_data_table_conf.xml` in `$GALAXY_ROOT/config/`: + + ```xml + + + value, name, path + +
+ ``` + +2. Add the `glimmer_hmm.loc` file referencing your trained organism, in `tool-data`. + You have a sample [`glimmer_hmm.loc.sample`] available in the repository to help you configure it properly +3. Add your data to the folder chosen at step 2. You can get it from the GlimmerHMM tar, `$GLIMMERHMM/trained_dir` + +History +======= + +- v3.0 - Add the Multi Fasta support +- v2.0 - Update by Rémi Marenco to make it work without having to modify the wrapper + add ability to select the species +- v0.1 - Initial public release + + +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. diff -r 000000000000 -r 0ddb5ee32ff6 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Aug 18 18:50:00 2016 -0400 @@ -0,0 +1,98 @@ + + + + + value, dbkey, name, path + +
+ + + value, dbkey, formats, name, path + +
+ + + value, name, path + +
+ + + value, dbkey, name, path + +
+ + + value, dbkey, name, path + +
+ + + name, value, dbkey, species + +
+ + + value, dbkey, name, path + +
+ + + value, name, path + +
+ + + value, name, path + +
+ + + value, dbkey, name, path + +
+ + + value, dbkey, name, path + +
+ + + value, dbkey, name, path + +
+ + + value, path + +
+ + + name, url, value + +
+ + + dbkey, name, value + +
+ + + value, name, url + +
+ + + value, name, url + +
+ + + value, name, url + +
+ + + value, name, path + +
+