Mercurial > repos > vmarcon > repet_tedenovo

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Mon Feb 06 13:31:53 2017 -0500
@@ -0,0 +1,69 @@
+REPET dependency
+
+For REPET version 2.5.1 (REPET with some patches; please contact urgi-contact@versailles.inra.fr to get it)
+
+The REPET dependency handles the creation of the essential environment variables.
+
+!!!!WARNING : REPET has to be installed!!!!
+
+
+After you have installed your tool with this dependency, please check that the defined environment variables suit your configuration.
+Check this file :
+***tool_dependency_dir***/repet/2.5/vmarcon/package_repet_2_5/***revision***/env.sh
+Please replace with correct values :
+  * tool_dependency_dir (value in galaxy.ini file)
+  * revision
+
+Then in this file (env.sh) modify variable value in order to adjust it to your
+system :
+ - Database connection (REPET_HOST, REPET_USER, REPET_PW, REPET_DB, REPET_PORT)
+ - Job manager (REPET_JOB_MANAGER, REPET_QUEUE)
+ - Working environment (REPET_PATH, REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES, REPET_RDNA_BANK)
+
+If you want REPET working in a specific temporary directory, fill the variable REPET_TMP_DIR.
+
+If you don't want to use one or several databanks, remove the corresponding variable.
+
+If you notice a problem running Grouper on step 3, please complete LIB_GCC4_8_2 and LD_LIBRARY_PATH variable.
+
+
+-------------
+ Galaxy Page
+-------------
+
+!!!! To get the content of the page and the example datasets, please request them by sending an e-mail to urgi-contact@versailles.inra.fr . !!!!
+
+To explain in detail to your users how TEdenovo_lite works, please make a galaxy Page:
+- Connect in your Galaxy.
+- Go to the "Saved Pages" (User > Saved Pages).
+- Create a new page ("Add new Page" button in the top right corner) named 'tedenovo'.
+- Click on "tedenovo" and "Edit content".
+- Paste the content of the URGI TEdenovo page.
+- Save.
+
+Now you have to create the two Embed Galaxy Objects (examples in the blue and green areas).
+- On the URGI TEdenovo page, download "DmelChr4.fa" by clicking on "Save dataset" (green area).
+- Upload this file on your Galaxy instance where REPET Lite is installed IN A NEW HISTORY.
+- Rename this dataset "DmelChr4.fa".
+- Launch "Repet Lite - TEdenovo" with "DmelChr4.fa" as Fasta alignment input, "Yes" to get classification informations.
+- Go back to the page in edition mode.
+- Add the green area - Embed Datasets - DmelChr4.fa
+- Add the blue area - Embed Histories - your history with TEdenovo result.
+- Save again.
+
+To publish your page:
+- Go to the list of "Saved Pages"
+- Click on "tedenovo" and "Share or Publish"
+- And "Make Page Accessible and Publish"
+
+Now this page is accessible with the URL shown and in Shared Data > Published Pages
+
+
+--------------------
+Visualization plugin
+--------------------
+
+We developed a plugin to visualize the statistical output (".classif_stats.txt").
+Please send an e-mail to urgi-contact@versailles.inra.fr to get it.
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TEdenovo.sh	Mon Feb 06 13:31:53 2017 -0500
@@ -0,0 +1,44 @@
+#!/bin/bash
+set -e
+
+fasta=$1
+outputfasta=$2
+classif=$3
+outputlog=$4
+outputclassif=$7
+outputconfig=$5
+outputstats=$6
+
+projectname=$(date "+%Y%m%d")
+
+
+add=''
+
+if [ $classif == yes ]
+then
+  add='-c'
+fi
+`dirname $0`'/'TEdenovo_lite.py -i $fasta -o $outputfasta $add > $outputlog
+projectname_complete=$(ls $(pwd)|grep $projectname)
+working_dir=$(pwd)/$projectname_complete
+sed -i 's@'"$working_dir"'@'$projectname'@g' $outputlog
+mv $outputfasta-$projectname-denovoLibTEs_filtered.fa $outputfasta
+mv $outputfasta-$projectname-classif_stats.txt $outputstats
+if [ $classif == yes ]
+then
+  mv $outputfasta-$projectname.classif $outputclassif
+fi
+
+
+workingconfigfile=$working_dir/TEdenovo_Galaxy_config_$projectname_complete
+sed -i 's|repet_host:.*|repet_host:|g' $workingconfigfile
+sed -i 's|repet_user:.*|repet_user:|g' $workingconfigfile
+sed -i 's|repet_pw:.*|repet_pw:|g' $workingconfigfile
+sed -i 's|repet_db:.*|repet_db:|g' $workingconfigfile
+sed -i 's|repet_port:.*|repet_port:|g' $workingconfigfile
+sed -i 's|repet_job_manager:.*|repet_job_manager:|g' $workingconfigfile
+sed -i 's|project_name:.*|project_name: '$projectname'|g' $workingconfigfile
+sed -i 's|project_dir:.*|project_dir:|g' $workingconfigfile
+sed -i 's|tmpDir:.*|tmpDir:|g' $workingconfigfile
+mv $workingconfigfile $outputconfig
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TEdenovo.xml	Mon Feb 06 13:31:53 2017 -0500
@@ -0,0 +1,322 @@
+<tool id="tedenovo" name="REPET Lite - TEdenovo" version="2.2.0">
+
+    <!-- [REQUIRED] Tool description displayed after the tool name -->
+    <description> Compute a library of transposable element</description>
+
+    <!-- [OPTIONAL] 3rd party tools, binaries, modules... required for the tool to work -->
+    <requirements>
+            <requirement type="binary">python</requirement>
+            <requirement type="package" version="2.5">repet</requirement>
+            </requirements>
+
+    <!-- [STRONGLY RECOMMANDED] Exit code rules -->
+     <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" level="fatal"/>
+        <exit_code range=":-1" level="fatal"/>
+
+    </stdio>
+
+   <!-- [OPTIONAL] Command to be executed to get the tool's version string -->
+    <version_command>
+    	TEdenovo.py --version
+    </version_command>
+
+    <!-- [REQUIRED] The command to execute -->
+    <command interpreter="bash">
+    	TEdenovo.sh $fasta $outputfasta $classif $outputlog $outputconfig $outputstats
+    	#if str( $classif ) == "yes":
+    	  $outputclassif
+    	#else
+    	''
+    	#end if
+    </command>
+
+    <!-- [REQUIRED] Input files and tool parameters -->
+    <inputs>
+    	<param name="fasta" type="data" format="fasta" optional="false" label="Fasta alignment input" />
+    	<param name="classif" type="select" label="Get classification informations" help="To add the informations at annotation file on next step." >
+    		<option value="no" selected="true">No</option>
+    	        <option value="yes">Yes</option>
+    	</param>
+    	<param name="label" type="text" label="Output name" />
+    </inputs>
+
+    <!-- [REQUIRED] Output files -->
+    <outputs>
+        <data name="outputlog" type="data" format="txt" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.log" />
+    	<data name="outputfasta" type="data" format="fasta" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.TElib.fa" />
+    	<data name="outputstats" type="data" format="txt" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.classif_stats.txt" />
+    	<data name="outputclassif" type="data" format="tabular" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.classif" >
+    		<filter>(classif == 'yes')</filter>
+    	</data>
+    	<data name="outputconfig" type="data" format="txt" label="TEdenovo-#if str($label)=='' then $fasta.name else $label #.cfg" />
+    </outputs>
+
+
+    <!-- [OPTIONAL] Tests to be run manually by the Galaxy admin -->
+    <tests>
+            <!-- [HELP] Test files have to be in the ~/test-data directory -->
+        <test>
+            <param name="fasta" value="DmelChr4Chr3.fa" />
+            <output name="outputfasta">
+            	<assert_contents>
+            		<has_line_matching expression="^>\w+" />
+            		<has_line_matching expression="[ACTG]{60}" />
+            	</assert_contents>
+            </output>
+            <output name="outputlog">
+                <assert_contents>
+                        <has_line_matching expression="^step 7 finished successfully" />
+                        <has_line_matching expression="^END time: \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" />
+			<has_line_matching expression="^Writing fasta file" />
+                </assert_contents>
+            </output>
+            <output name="outputconfig">
+            	<assert_contents>
+            		<has_line_matching expression="^project_name: \d{8}" />
+            		<has_line_matching expression="^repet_version: 2.5" />
+            		<has_line_matching expression="^tmpDir:" />
+            		<has_line_matching expression="^clean: yes" />
+            	</assert_contents>
+            </output>
+            <output name="outputstats">
+            	<assert_contents>
+            		<!-- Literal parentheses and dot are escaped; rule lines accept any number of dashes -->
+            		<has_line_matching expression="-+Summary-+" />
+            		<has_line_matching expression="TOTAL: \d+ \(\d+\.\d+%\)" />
+            		<has_line_matching expression="-+NOTES-+" />
+            	</assert_contents>
+            </output>
+        </test>
+	<test>
+	    <param name="fasta" value="DmelChr4Chr3.fa" />
+	    <param name="classif" value="yes" />
+	    <output name="outputfasta">
+	        <assert_contents>
+	                <has_line_matching expression="^>\w+" />
+	                <has_line_matching expression="[ACTG]{60}" />
+	        </assert_contents>
+	    </output>
+	    <output name="outputlog">
+	        <assert_contents>
+	                <has_line_matching expression="^step 7 finished successfully" />
+	                <has_line_matching expression="^END time: \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}" />
+	                <has_line_matching expression="^Writing fasta file" />
+	        </assert_contents>
+	    </output>
+	    <output name="outputconfig">
+	        <assert_contents>
+	                <has_line_matching expression="^project_name: \d{8}" />
+	                <has_line_matching expression="^repet_version: 2.5" />
+	                <has_line_matching expression="^tmpDir:" />
+	                <has_line_matching expression="^clean: yes" />
+	        </assert_contents>
+	    </output>
+	    <output name="outputstats">
+	        <assert_contents>
+	                <!-- Literal parentheses and dot are escaped; rule lines accept any number of dashes -->
+	                <has_line_matching expression="-+Summary-+" />
+	                <has_line_matching expression="TOTAL: \d+ \(\d+\.\d+%\)" />
+	                <has_line_matching expression="-+NOTES-+" />
+	        </assert_contents>
+	    </output>
+	    <output name="outputclassif">
+	        <assert_contents>
+	        	<has_n_columns n="8" />
+	        </assert_contents>
+	    </output>
+	</test>
+    </tests>
+
+    <!-- [OPTIONAL] Help displayed in Galaxy -->
+    <help>
+<![CDATA[
+.. class:: infomark
+
+
+**Authors**
+Gwendoline Andres
+Valentin Marcon
+Veronique Jamilloux
+Olivier Inizan
+
+---------------------------------------------------
+
+.. class:: infomark
+
+**Please cite** If you use this tool, please cite
+
+---------------------------------------------------
+
+==============
+TEdenovo Lite
+==============
+
+-----------
+Description
+-----------
+  REPET is for detection and annotation of transposable elements (TE). The light version available on Galaxy is specialised in transposable element masking.
+  TEdenovo is the first step to constitute a consensus library of TE.
+  For a detailed description of each parameter used, please consult the Galaxy page in "Shared Data > Published Pages"
+
+-----------------
+Workflow position
+-----------------
+
+**Downstream tools**
+
+=========== ========================== =======
+Name            output file(s)         format
+=========== ========================== =======
+TEannot     GFF with TE masked         gff
+=========== ========================== =======
+
+
+----------
+Input file
+----------
+
+Fasta file
+	Genome file at fasta format
+
+
+------------
+Output files
+------------
+
+Output_name.fa
+	TE library
+
+Output_name.log
+	log file to see each steps progress
+
+Output_name.cfg
+	File to show which params have been used
+
+Output_name.classif_stats.txt
+	File with statistics you can visualize
+
+Output_name.classif
+	If asked, the classification file to give to next step (TEannot)
+
+
+---------------
+Working example
+---------------
+
+Input files
+===========
+
+Fasta file
+-----------
+
+::
+
+	>dmel_chr4
+	GAATTCGCGTCCGCTTACCCATGTGCCTGTGGATGCCGAACAGGAGGCGCCGTTGACGGC
+	GAATGACTTACTCAAGGGAGTAGCCAATCTGTCGGATACGCCCGGATTGGAGCTGCCCAT
+	GGAGGGTTCTACAAGAAAGCGGTGGAGGATTGCTCGCATACTGCGAGACCGTTTCTGAAG
+	GAGATGGCTCATGGAGTACCTGCCTACGCTTGTGCGCCGCGAGAAGTGGTGAAGAAGAAC
+	GGAGCCCATACACCAGGGTGATATGGTCTTCGTCTGCGATCCCGCCTTGCCCCGGCGAGA
+	GTGGTGCAAGGGCATCATGGAGGAAGTCTCCAGCAGAGCAGATGGAGCAACGGCCTATAG
+	AGGACACTGATGCTACCCGTCTCTAAGCTTGCAGTTTTGGATTTAAGTGAATCGGTTATT
+	CACGGGGTCGGGGATGTCGCGGATCGAACGGTGCAATCGATAGGCGTAATCAGTATTTCC
+	AGATAGTGATAAGATTTGGTGGATAAATGTGTGCGGGCACACTAATGGCCGCCATCGTAA
+	GCCGCGAAAAGCTTAGCGTGCATTGTCGATCGAGAGTTTGGAGGGCAAACTGCGGTAAGA
+	TAAGATTAAATAATTTGTACTGAATAATCTTAAAGAATCCTGATGGAAAGCGCCATGCAG
+	TCACATATAATATGTGCAGAGCTCTCCTC
+
+
+Output files
+============
+
+output fasta : TE library
+-------------------------
+
+::
+
+	>DTX-incomp_20150313101806-B-G1-Map3
+	ATACAGCTGCGGTTAAAATAATAGCACTACTGCAGGTGGAAAGTTGATTTCCTAAAAAAA
+	ATTATTAAATGTTTATATTTTTTTAAGTCAGATTGCATGAATAATAAGTACCATATGTTG
+	GCTCTCTGAGCAAGAAATTTTTAG
+	>RLX-incomp_20150313101806-B-R12-Map3_reversed
+	ATGATAAGTAGGCAAACTATAAAAATGTTCTATTTATGGGCTGCAATAAACATGTCACCG
+	GACAGCATAAGTGGCAACTACAG
+
+
+output config : .cfg
+--------------------
+
+::
+
+	[repet_env]
+	repet_version: 2.4
+	repet_host: ******
+	repet_user: ******
+
+
+output stats : .classif_stats.txt
+---------------------------------
+
+::
+
+	LTR incomp: 1 (10.00%)
+	LTR total (RLX): 1 (10.00%)
+
+	ClassI + one order: 1 (10.00%)
+	ClassI total (RXX): 1 (10.00%)
+
+	-------------------------Summary--------------------------------
+
+	RXX: 1 (10.00%)
+	DXX: 9 (90.00%)
+	TOTAL: 10 (100.00%)
+
+	-----------------------------NOTES------------------------------
+
+
+output classif : Classification file
+------------------------------------
+
+::
+
+	DTX-incomp_dataset_370.dat-B-G1-Map3	542	+	ok	II	TIR	incomplete	CI=37; coding=(TE_BLRtx: TC1_DM:ClassII:TIR:Tc1-Mariner: 32.59%; TE_BLRx: Mariner-1_DAn_1p:ClassII:TIR:Tc1-Mariner: 18.43%); struct=(TElength: <700bps); other=(Other_profiles: PF13936.1_HTH_38_NA_OTHER_27.0: 77.27%(77.27%); SSRCoverage=0.03)
+	DTX-comp_dataset_370.dat-B-G8-Map20	1244	+	ok	II	TIR	complete	CI=50; coding=(TE_BLRtx: PROTOP:ClassII:TIR:P: 12.03%, PROTOP_A:ClassII:TIR:P: 49.14%); struct=(TElength: >1000bps; TermRepeats: termTIR: 50); other=(SSRCoverage=0.25)
+
+]]>
+    </help>
+
+    <citations>
+	<citation type="bibtex"><![CDATA[@article{10.1371/journal.pone.0016526,
+	    author = {Flutre, Timothée AND Duprat, Elodie AND Feuillet, Catherine AND Quesneville, Hadi},
+	    journal = {PLoS ONE},
+	    publisher = {Public Library of Science},
+	    title = {Considering Transposable Element Diversification in <italic>De Novo</italic> Annotation Approaches},
+	    year = {2011},
+	    month = {01},
+	    volume = {6},
+	    url = {http://dx.doi.org/10.1371%2Fjournal.pone.0016526},
+	    pages = {e16526},
+	    abstract = {
+	    <p>Transposable elements (TEs) are mobile, repetitive DNA sequences that are almost ubiquitous in prokaryotic and eukaryotic genomes. They have a large impact on genome structure, function and evolution. With the recent development of high-throughput sequencing methods, many genome sequences have become available, making possible comparative studies of TE dynamics at an unprecedented scale. Several methods have been proposed for the <italic>de novo</italic> identification of TEs in sequenced genomes. Most begin with the detection of genomic repeats, but the subsequent steps for defining TE families differ. High-quality TE annotations are available for the <italic>Drosophila melanogaster</italic> and <italic>Arabidopsis thaliana</italic> genome sequences, providing a solid basis for the benchmarking of such methods. We compared the performance of specific algorithms for the clustering of interspersed repeats and found that only a particular combination of algorithms detected TE families with good recovery of the reference sequences. We then applied a new procedure for reconciling the different clustering results and classifying TE sequences. The whole approach was implemented in a pipeline using the REPET package. Finally, we show that our combined approach highlights the dynamics of well defined TE families by making it possible to identify structural variations among their copies. This approach makes it possible to annotate TE families and to study their diversification in a single analysis, improving our understanding of TE dynamics at the whole-genome scale and for diverse species.</p>
+	    },
+	    number = {1},
+	    doi = {10.1371/journal.pone.0016526}
+	}]]></citation>
+	<citation type="bibtex"><![CDATA[@article{10.1371/journal.pone.0094101,
+    author = {Maumus, Florian AND Quesneville, Hadi},
+    journal = {PLoS ONE},
+    publisher = {Public Library of Science},
+    title = {Deep Investigation of <italic>Arabidopsis thaliana</italic> Junk DNA Reveals a Continuum between Repetitive Elements and Genomic Dark Matter},
+    year = {2014},
+    month = {04},
+    volume = {9},
+    url = {http://dx.doi.org/10.1371%2Fjournal.pone.0094101},
+    pages = {e94101},
+    abstract = {<p>Eukaryotic genomes contain highly variable amounts of DNA with no apparent function. This so-called junk DNA is composed of two components: repeated and repeat-derived sequences (together referred to as the repeatome), and non-annotated sequences also known as genomic dark matter. Because of their high duplication rates as compared to other genomic features, transposable elements are predominant contributors to the repeatome and the products of their decay is thought to be a major source of genomic dark matter. Determining the origin and composition of junk DNA is thus important to help understanding genome evolution as well as host biology. In this study, we have used a combination of tools enabling to show that the repeatome from the small and reducing <italic>A. thaliana</italic> genome is significantly larger than previously thought. Furthermore, we present the concepts and results from a series of innovative approaches suggesting that a significant amount of the <italic>A. thaliana</italic> dark matter is of repetitive origin. As a tentative standard for the community, we propose a deep compendium annotation of the <italic>A. thaliana</italic> repeatome that may help addressing farther genome evolution as well as transcriptional and epigenetic regulation in this model plant.</p>},
+    number = {4},
+    doi = {10.1371/journal.pone.0094101}
+	}]]></citation>
+    </citations>
+
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TEdenovo_lite.py	Mon Feb 06 13:31:53 2017 -0500
@@ -0,0 +1,406 @@
+#!/usr/bin/env python
+
+
+import os
+import sys
+import time
+import glob
+import shutil
+import ConfigParser
+from commons.core.seq.FastaUtils import *
+import operator
+import re
+
+
+
+if not "REPET_PATH" in os.environ.keys():
+    print "ERROR: no environment variable REPET_PATH"
+    sys.exit(1)
+
+if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()):
+    print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW"
+    sys.exit(1)
+
+if not "REPET_JOB_MANAGER" in os.environ.keys():
+    print "ERROR: no environment variable REPET_JOB_MANAGER"
+    sys.exit(1)
+
+
+if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]:
+    os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"])
+
+sys.path.append(os.environ["REPET_PATH"])
+if not "PYTHONPATH" in os.environ.keys():
+    os.environ["PYTHONPATH"] = os.environ["REPET_PATH"]
+else:
+    os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"])
+
+from commons.core.LoggerFactory import LoggerFactory
+from commons.core.checker.RepetException import RepetException
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.seq.FastaUtils import FastaUtils
+from commons.core.sql.DbFactory import DbFactory
+from itertools import islice
+
+LOG_DEPTH = "TEdenovo.pipeline"
+
+class TEdenovo_lite(object):
+
+    def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0):
+        self._configFileName = configFileName
+        self._fastaFileName = os.path.abspath(fastaFileName)
+        self._projectName = time.strftime("%Y%m%d%H%M%S")
+        self._limitSeqSize = 200000000
+
+	if "REPET_NUCL_BANK" in os.environ.keys():
+	    if os.path.exists(os.environ["REPET_NUCL_BANK"]):
+        	self._nucl_bank = os.environ["REPET_NUCL_BANK"]
+	    else :
+		print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable"
+                sys.exit(1)
+	else :
+	    self._nucl_bank = ""
+	if "REPET_PROT_BANK" in os.environ.keys():
+            if os.path.exists(os.environ["REPET_PROT_BANK"]):
+		self._prot_bank = os.environ["REPET_PROT_BANK"]
+	    else :
+                print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
+                sys.exit(1)
+	else :
+	     self._prot_bank = ""
+	if "REPET_HMM_PROFILES" in os.environ.keys():
+            if os.path.exists(os.environ["REPET_HMM_PROFILES"]):
+        	self._HMM_profiles = os.environ["REPET_HMM_PROFILES"]
+            else :
+                print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable"
+                sys.exit(1)
+	else :
+	    self._HMM_profiles = ""
+	if "REPET_RDNA_BANK" in os.environ.keys():
+            if os.path.exists(os.environ["REPET_RDNA_BANK"]):
+        	self._rdna_bank = os.environ["REPET_RDNA_BANK"]
+            else :
+                print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
+                sys.exit(1)
+	else :
+            self._rdna_bank = ""
+	if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == ""  :
+	    print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment"
+	if "REPET_TMP_DIR" in os.environ.keys():
+	    self._tmp_dir = os.environ["REPET_TMP_DIR"]
+	else :
+	    self._tmp_dir = ""
+        self._outputFasta = ""
+        self._classif = False
+        self._outputClassif = ""
+	self._outputStats = ""
+        self._verbosity = verbosity
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+
+    def setAttributesFromCommandLine(self):
+        description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo."
+        epilog = "Example: TEdenovo_lite.py -i fastaFileName \n"
+        version = "2.0"
+        parser = RepetOptionParser(description = description, epilog = epilog, version = version)
+        parser.add_option("-i", "--fasta",           dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "")
+        parser.add_option("-c", "--withClassif",      dest="withClassif",	action="store_true",	help = " Get classification files in output.", default = False)
+        parser.add_option("-o", "--output",      dest="outputLabel"      , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "")
+        parser.add_option("-v", "--verbosity",       dest = "verbosity",      action = "store", type = "int",    help = "Verbosity [optional] [default: 2]", default = 2)
+        options = parser.parse_args()[0]
+        self._setAttributesFromOptions(options)
+
+    def _setAttributesFromOptions(self, options):
+        self.setConfigFileName("")
+        if options.fastaFileName=="":
+            print "ERROR : You have to enter an input fasta file"
+            print "Example: TEdenovo_lite.py -i fastaFileName \n"
+            print "More option : TEdenovo_lite.py --help "
+            exit(1)
+        else :
+            self._fastaFileName = os.path.abspath(options.fastaFileName)
+        if options.outputLabel=="":
+            fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0])
+            options.outputLabel=fastaBaseName
+        self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8])
+	self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8])
+        self._verbosity = options.verbosity
+        if options.withClassif :
+            self._classif=True
+            self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8])
+
+    def setConfigFileName(self, configFileName):
+        self._configFileName = configFileName
+        if not self._configFileName:
+            self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName
+
+    def setAttributesFromConfigFile(self, configFileName):
+        config = ConfigParser.ConfigParser()
+        config.readfp( open(configFileName) )
+
+
+    def _writeConfigFile(self):
+        if FileUtils.isRessourceExists(self._configFileName):
+            self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.")
+
+        shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName)
+        self.setAttributesFromConfigFile(self._configFileName)
+
+        os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName))
+        os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName))
+        os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName))
+        os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName))
+        os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName))
+        os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName))
+        os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName))
+        os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName))
+        os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName))
+
+        if  self._nucl_bank != "" and self._nucl_bank != None:
+            os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName))
+
+        if  self._prot_bank != "" and self._prot_bank != None:
+            os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName))
+
+        if  self._HMM_profiles != "" and self._HMM_profiles != None:
+            os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName))
+
+        if  self._rdna_bank != "" and self._rdna_bank != None:
+            os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" %  self._configFileName)
+            os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName))
+
+        os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName))
+
+
+    def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000):
+	if self._verbosity > 0:
+            print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize)
+        t0=time.time()
+        Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize)
+        t1=time.time()
+        refBSDB = BioseqDB(self._fastaFileName)
+        t3=time.time()
+        debut=1
+        t2=time.time()
+        if len(Nstretches)>0:
+            currentchrom=Nstretches[0].seqname
+            refBS=refBSDB.fetch(currentchrom)
+        t3=time.time()
+        newBSDB = BioseqDB()
+        i=0
+        seqInNstretches = []
+        for Nstretch in Nstretches :
+            i+=1
+            tmpBS=""
+            if Nstretch.seqname not in seqInNstretches :
+                seqInNstretches.append(Nstretch.seqname)
+            if currentchrom==Nstretch.seqname:
+                    fin=Nstretch.start-1
+                    size=fin-debut+1
+                    if size>minContigsize :
+                            tmpBS=refBS.subseq(debut,fin)
+                            newBSDB.add(tmpBS)
+
+                    debut=Nstretch.end+1
+
+            else :
+                    fin = refBSDB.getSeqLength(currentchrom)
+                    size=fin-debut+1
+                    if size>minContigsize :
+                        tmpBS=refBS.subseq(debut,fin)
+                        newBSDB.add(tmpBS)
+                    currentchrom=Nstretch.seqname
+                    refBS=refBSDB.fetch(currentchrom)
+                    debut=1
+                    fin==Nstretch.start
+                    size=fin-debut+1
+                    if size>minContigsize :
+                        tmpBS=refBS.subseq(debut,fin)
+                        newBSDB.add(tmpBS)
+                    debut=Nstretch.end+1
+
+        if len(Nstretches)>0:
+            fin = refBSDB.getSeqLength(currentchrom)
+            size=fin-debut+1
+            if size>minContigsize :
+                tmpBS=refBS.subseq(debut,fin)
+                newBSDB.add(tmpBS)
+
+        for refName in refBSDB.getHeaderList() :
+            if refName not in seqInNstretches:
+                debut=1
+                fin=refBSDB.getSeqLength(refName)
+                size=fin-debut+1
+                if size>minContigsize :
+                    refBS=refBSDB.fetch(refName)
+                    tmpBS=refBS.subseq(debut,fin)
+                    newBSDB.add(tmpBS)
+
+
+        t5b=time.time()
+        if self._verbosity >= 2:
+            print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize())
+        #newBSDB.sortByLength(reverse=True)
+
+        return newBSDB
+
+
+
+#TODO refactoring about min size of genome for preprocess
+    def selectContigs4givenSize(self,BSDB,limit=200000000):
+        """Select the leading contigs of BSDB whose cumulative length best approaches limit.
+
+        Contigs are read in the order of BSDB.db. A contig is kept as long as
+        the running total stays strictly below limit. The first contig that
+        brings the total to limit or above ends the selection; it is kept only
+        if the total including it is strictly closer to limit than the total
+        without it. If the whole bank stays below limit, every contig is kept
+        exactly once.
+
+        @param BSDB: bank providing db (iterable of sequences with getLength())
+                     and getHeaderList()
+        @param limit: target cumulative length, in the unit returned by getLength()
+        @return: a new BioseqDB filled with the selected sequences
+        """
+        if self._verbosity > 0:
+            print "Selecting contigs to reach %s pb "%limit
+        initialContigNb=len(BSDB.getHeaderList())
+        lselectedContigs=[]
+        cumulLength=0
+
+        for seq in BSDB.db :
+            lengthWithSeq=cumulLength+seq.getLength()
+            if lengthWithSeq<limit:
+                lselectedContigs.append(seq)
+                cumulLength=lengthWithSeq
+                continue
+            # seq is the contig crossing the limit: decide here, inside the loop,
+            # so that a bank never reaching the limit cannot re-append its last contig.
+            if lengthWithSeq-limit<limit-cumulLength :
+                lselectedContigs.append(seq)
+            break
+
+        if self._verbosity > 0:
+            print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,initialContigNb)
+
+        selectedContigsBSDB=BioseqDB()
+        selectedContigsBSDB.setData(lselectedContigs)
+        return selectedContigsBSDB
+
+    def writeFastaInput(self,BSDB,outFileName=''):
+        """Save BSDB to a fasta file and return the name of that file.
+
+        @param BSDB: bank providing save(fileName) and getSize()
+        @param outFileName: destination file; when empty, the file is named
+                            '<projectName>.fastaExtract'
+        @return: the file name actually used
+        """
+        beVerbose = self._verbosity > 0
+        if beVerbose:
+            print "Writing fasta file"
+
+        targetFileName = outFileName or (self._projectName + ".fastaExtract")
+        BSDB.save(targetFileName)
+
+        if beVerbose:
+            print '%d sequences saved.'%BSDB.getSize()
+        return targetFileName
+
+    def correctHeader(self,BSDB):
+        """Replace by '_' every header character outside letters, digits, '_', ':', '.' and '-'.
+
+        Headers are modified in place on the sequences of BSDB.
+
+        @param BSDB: bank providing getHeaderList() and fetch(header)
+        @return: the same BSDB object
+        """
+        beVerbose = self._verbosity > 0
+        if beVerbose:
+            print "Correcting fasta headers"
+
+        forbiddenChar = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE)
+        nbCorrected = 0
+        for oldHeader in BSDB.getHeaderList() :
+            newHeader = forbiddenChar.sub('_', oldHeader)
+            if newHeader == oldHeader:
+                # nothing to replace in this header
+                continue
+            if self._verbosity>2:
+                print "Correct Header : '%s' replaced by '%s'"%(oldHeader,newHeader)
+            BSDB.fetch(oldHeader).setHeader(newHeader)
+            nbCorrected += 1
+
+        if beVerbose:
+            print '%s sequence headers corrected'%nbCorrected
+        return BSDB
+
+
+    def _launchTEdenovo(self):
+        """Run the TEdenovo.py steps in sequence, then collect the result files.
+
+        Steps -S 1 to -S 7 are launched through the shell; steps 3 and 4 are
+        launched once for each -c value (Grouper, Recon, Piler). If a command
+        returns a non-zero status, or if an expected result file cannot be
+        found afterwards, the project tables are cleaned and the script exits
+        with status 1. On success the filtered library, the classification
+        statistics and (when self._classif is set) the classification file are
+        copied to the output paths and _renameTE() is called.
+        """
+        print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
+        lStepOptions = [
+            "-S 1",
+            "-S 2 -s Blaster",
+            "-S 3 -s Blaster -c Grouper",
+            "-S 3 -s Blaster -c Recon",
+            "-S 3 -s Blaster -c Piler",
+            "-S 4 -s Blaster -c Grouper -m Map",
+            "-S 4 -s Blaster -c Recon -m Map",
+            "-S 4 -s Blaster -c Piler -m Map",
+            "-S 5 -s Blaster -c GrpRecPil -m Map",
+            "-S 6 -s Blaster -c GrpRecPil -m Map",
+            "-S 7 -s Blaster -c GrpRecPil -m Map",
+        ]
+        # All command lines are built before the first one is run.
+        lCmds = [ "TEdenovo.py -P %s -C %s %s -v %i" % (self._projectName, self._configFileName, stepOptions, self._verbosity)
+                  for stepOptions in lStepOptions ]
+
+        for cmd in lCmds:
+            returnValue = os.system(cmd)
+            if returnValue != 0:
+                print "ERROR: command '%s' returned %i" % (cmd, returnValue)
+                self._cleanTables()
+                sys.exit(1)
+
+        print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
+        filteredDir = "%s_Blaster_GrpRecPil_Map_TEclassif_Filtered" % self._projectName
+        self._copyFirstMatch("%s/*_denovoLibTEs_filtered.fa" % filteredDir, self._outputFasta)
+        self._copyFirstMatch("%s/*.classif_stats.txt" % filteredDir, self._outputStats)
+        if self._classif:
+            self._copyFirstMatch("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif" % self._projectName, self._outputClassif)
+        self._renameTE()
+
+    def _copyFirstMatch(self, pattern, destination):
+        """Copy the first file matching the glob pattern to destination; clean tables and exit(1) if none matches."""
+        lMatches = glob.glob(pattern)
+        if not lMatches:
+            print "ERROR: no output file matching '%s'" % pattern
+            self._cleanTables()
+            sys.exit(1)
+        shutil.copy(lMatches[0], destination)
+
+    def _renameTE(self):
+        """Replace the project name by the original dataset name in the output files.
+
+        The dataset name is the part of self._outputFasta located before
+        '-<8 digits>-denovoLibTEs_filtered.<fa|fasta|fsa|fas>'. On every line of
+        the output fasta (and of the classification file when self._classif is
+        set) the first occurrence of the project name is replaced by it.
+        If the output file name does not follow that pattern, a warning is
+        printed and the files are left untouched.
+
+        The replacement is done in Python on literal text rather than through a
+        'sed' shell command, so names containing shell or regex special
+        characters are handled safely.
+        """
+        # (?:...) is a real alternation; the former [fa|fasta|fsa|fas] was a character class.
+        match = re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.(?:fasta|fas|fsa|fa)', self._outputFasta)
+        if match is None:
+            print "WARNING: cannot extract a name from '%s', TE names not renamed" % self._outputFasta
+            return
+        name = match.group(1)
+
+        lFileNames = [self._outputFasta]
+        if self._classif:
+            lFileNames.append(self._outputClassif)
+        for fileName in lFileNames:
+            self._replaceFirstInEachLine(fileName, self._projectName, name)
+
+    def _replaceFirstInEachLine(self, fileName, old, new):
+        """Rewrite fileName, replacing on each line the first literal occurrence of old by new."""
+        with open(fileName, "r") as handle:
+            lLines = handle.readlines()
+        with open(fileName, "w") as handle:
+            handle.writelines([line.replace(old, new, 1) for line in lLines])
+
+    def preprocessFastaFile(self):
+        """Prepare the fasta file used by the pipeline and link it as '<projectName>.fa'.
+
+        The cumulative length of the input fasta is computed first. When it
+        reaches self._limitSeqSize, the input is preprocessed: N stretches are
+        removed, contigs are selected, headers are corrected and the result is
+        written to a new fasta file. Otherwise the input file is used as is.
+        In both cases a symbolic link '<projectName>.fa' pointing to the chosen
+        file is created in the current working directory.
+        """
+        # The context manager closes the handle even if dbCumLength raises.
+        with open(self._fastaFileName, "r") as inFileHandler:
+            cumulLength = FastaUtils.dbCumLength(inFileHandler)
+        if cumulLength >= self._limitSeqSize:
+            print "Preprocess lauched"
+            allContigsBSDB=self.removeNstretches()
+            selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB)
+            self.correctHeader(selectedContigsBSDB)
+            fastaFile=self.writeFastaInput(selectedContigsBSDB)
+            print "Preprocess finished"
+        else:
+            fastaFile=self._fastaFileName
+            print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000)
+        os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) # make the fasta available in the project directory
+
+    def _launchListAndDropTables(self):
+        """Run ListAndDropTables.py with the config file (-C) and the quoted project name (-d)."""
+        cmdTemplate = "ListAndDropTables.py -C %s -d '%s'"
+        os.system(cmdTemplate % (self._configFileName, self._projectName))
+
+    def _cleanJobsTable(self):
+        """Delete from the 'jobs' table the rows whose groupid starts with the project name.
+
+        The database connection is obtained from the config file and is closed
+        even when the DELETE statement fails.
+        """
+        db = DbFactory.createInstance( configFileName = self._configFileName )
+        try:
+            # NOTE: '_' and '%' inside the project name act as wildcards in SQL LIKE.
+            sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName
+            db.execute( sql_cmd )
+        finally:
+            db.close()
+
+    def _cleanTables(self):
+        """Clean the database: run ListAndDropTables.py for the project, then purge its rows from the jobs table."""
+        self._launchListAndDropTables()
+        self._cleanJobsTable()
+
+
+    def run(self):
+        """Run the whole pipeline inside a new directory named after the project.
+
+        Creates and enters the project directory, writes the config file,
+        prepares the input fasta, links every configured bank into the
+        directory, launches TEdenovo and finally cleans the database tables.
+        """
+        os.mkdir(self._projectName)
+        os.chdir(self._projectName)
+        self._writeConfigFile()
+        self.preprocessFastaFile()
+
+        # Banks are optional: only those set to a non-empty value are linked.
+        for bankAttribute in ("_nucl_bank", "_prot_bank", "_HMM_profiles", "_rdna_bank"):
+            bankPath = getattr(self, bankAttribute)
+            if bankPath == "" or bankPath == None:
+                continue
+            linkName = "%s/%s" %(os.getcwd(),os.path.basename(bankPath))
+            os.symlink(bankPath,linkName)
+
+        self._launchTEdenovo()
+        self._cleanTables()
+
+if __name__ == '__main__':
+    # Script entry point: configure the pipeline from the command line, then run it.
+    pipeline = TEdenovo_lite()
+    pipeline.setAttributesFromCommandLine()
+    pipeline.run()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Feb 06 13:31:53 2017 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency> <!-- dependencies declared for this repository -->
+    <package name="repet" version="2.5"> <!-- package "repet", version 2.5 -->
+	<repository changeset_revision="f3b6009634b1" name="package_repet_2_5" owner="vmarcon" toolshed="https://toolshed.g2.bx.psu.edu" /> <!-- pinned revision of the repository package_repet_2_5 (owner vmarcon) on the listed tool shed -->
+    </package>
+</tool_dependency>