Mercurial > repos > urgi-team > teiso
diff TEisotools-1.0/TEiso/LaunchTEiso.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TEisotools-1.0/TEiso/LaunchTEiso.py Wed Jul 20 05:00:24 2016 -0400 @@ -0,0 +1,503 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.parsing.GtfParser import GtfParser +from commons.core.parsing.GffParser import GffParser +from TEiso.Bowtie_build import Bowtie_build +from TEiso.Tophat import Tophat +from TEiso.Cufflinks import Cufflinks +from TEiso.Cuffcompare import Cuffcompare +from TEiso.Bedtools_closest import Bedtools_closest +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.utils.FileUtils import FileUtils +import os +import time +import re +import sys +LOG_NAME = "repet.TEiso" + +class LaunchTEiso(object): + + def __init__(self, reference="", input_transcripts="", single_paired="", single_reads="", left_reads="", right_reads="", transposable_element = "", assembly_tool="", verbosity=3): + self._reference = reference + self._transcripts = input_transcripts + self._type = single_paired + self._single_reads = single_reads.split(",") + self._left_reads = left_reads.split(",") + self._right_reads = right_reads.split(",") + self._TE = transposable_element + self._assembly_tool = assembly_tool + self._verbosity = verbosity + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) + + def _setAttributesFromCmdLine(self): + self._toolVersion = "0.1" + description = "TEiso version %s" % self._toolVersion + epilog = "\n if reads are single:\n LaunchTEiso.py -f <genome.fa> -g <transcripts.gtf> -e single -s <single_read> -t <TE.gff> -a cufflinks \n" + epilog += " if reads are paired:\n LaunchTEiso.py -f <genome.fa> -g <transcripts.gtf> -e paired -l <reads_left> -r <reads_right> -t <TE.gff> -a cufflinks \n" + parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) + parser.add_option('-f' , '--input_reference' , dest='input_reference' , help='file with ref sequences') + parser.add_option('-g' , '--input_transcripts', dest='input_transcripts', help='GTF/GFF with known transcripts' , default="") + parser.add_option('-e' , '--single_paired' , dest='single_paired' , help='type of input reads, single or paired end', default="paired") + parser.add_option('-s' , '--single_read' , dest='single_read' , help='a single input read' , default="") + parser.add_option('-l', '--left_read' , dest='left_read' , help='left reads' , default="") + parser.add_option('-r', '--right_read' , dest='right_read' , help='right reads' , default="") + parser.add_option('-t' , '--input_transposable_element', dest='input_transposable_element', help='GFF with known transposable_element' , default="") + parser.add_option('-a' , '--assembly_tool' , dest='assembly_tool' , help='type of RNA-Seq assembly tool' , default="cufflinks") + options = parser.parse_args()[0] + self.setAttributesFromOptions(options) + + def setAttributesFromOptions(self, options): + self._reference = options.input_reference + self._transcripts = options.input_transcripts + self._type = options.single_paired + self._single_reads = options.single_read.split(",") + self._left_reads = options.left_read.split(",") + self._right_reads = options.right_read.split(",") + self._TE = options.input_transposable_element + self._assembly_tool = options.assembly_tool + + def _logAndRaise(self, errorMsg): + self._log.error(errorMsg) + #raise Exception(errorMsg) + sys.exit(1) + + def checkOptions(self): + if self._type == "paired": + if self._single_reads != ['']: + self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") + if self._left_reads == ['']: + self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") + if self._right_reads == ['']: + self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") + if self._right_reads == self._left_reads: + self._logAndRaise("ERROR: -l and -r options are same!") + + if self._type == "single": + if self._left_reads != ['']: + self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") + if self._right_reads != ['']: + self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") + if self._single_reads == ['']: + self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") + + if self._reference != "": + if not FileUtils.isRessourceExists(self._reference): + self._logAndRaise("ERROR: reference file %s does not exist!" % self._reference) + else: + self._logAndRaise("ERROR: No specified -f option!") + + if self._transcripts != "": + if not FileUtils.isRessourceExists(self._transcripts): + self._logAndRaise("ERROR: transcripts file %s does not exist!" % self._transcripts) + else: + self._logAndRaise("ERROR: No specified -g option!") + + if self._TE != "": + if not FileUtils.isRessourceExists(self._TE): + self._logAndRaise("ERROR: transposable elements %s does not exist!" % self._TE) + else: + self._logAndRaise("ERROR: No specified -t option!") + + + + def getTranscriptToBed(self, inputFile,outputFile): + try: + filewrite=open(outputFile, "w") + gtfParser = GtfParser(inputFile, assemblyTools=True) + for transcript in gtfParser.getIterator(): + if(transcript.getDirection()==1): + strand="+" + else: + strand="-" + filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n" % (transcript.getChromosome(),transcript.getStart(), + transcript.getEnd(), transcript.getTagValue("ID"), transcript.getTagValue("gene_id"), + strand,float(transcript.getTagValue("FPKM")) )) + + filewrite.close() + except: + self._logAndRaise("Couldn't open %s for writing" % outputFile) + + + def getTEGFFToBed(self, inputFile, outputFile): + """TODO Dont write bed line when the strand is '.' + See Gtf parser option assemblyTools + """ + try: + filewrite=open(outputFile, "w") + gffParser = GffParser(inputFile) + for transcript in gffParser.getIterator(): + if(transcript.getDirection()==1): + strand="+" + else: + strand="-" + filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript.getChromosome(),transcript.getStart(), + transcript.getEnd(), transcript.getTagValue("ID").split("_")[0]+"_", transcript.getTagValue("Target").split("_")[0], strand) ) + + filewrite.close() + except: + self._logAndRaise("Couldn't open %s for writing" % outputFile) + + + def getTEnearPromoter (self, bedtoolsfile): + #### BEdParser.py in commons is not used because the format of this bed file is different. +# #Chrom starttr endtr transcript_id gene_ID strand fpkm chromte startte endte idte targetTE strandTE distance + #scaffold_1 37570 37785 GSSPFG00034586001-RA GSSPFG00034586001 + 0.0000000000 scaffold_1 33914 40164 ms162_ PotentialHostGene - 0 + + linelist = [] + tmplist = [] + with open(bedtoolsfile, "r") as bedFile: + for line in bedFile.readlines(): + m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) + if(m != None): + start_TR = int(m.group(2))##F[1] + end_TR = int(m.group(3))##F[2] + strand_TR= m.group(6) ##[5] + start_TE = int(m.group(9))##[8] + end_TE = int(m.group(10))##[9] + dist = int(m.group(14))##[13] + if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): + tmplist.append(line.strip()) + tmplist.append("TE_closest_TSS") + linelist.append(tmplist) + tmplist = [] + # F[1] gene F[2] + # =========================> + # ------------ + # F[8] F[9] + + if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): + tmplist.append(line.strip()) + tmplist.append("TE_closest_TSS") + linelist.append(tmplist) + tmplist = [] + + # F[1] F[2] + # <====================== + # --------------- + + if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + overlap = (end_TE-start_TR)+1 + tmplist.append(overlap) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + # F[1] gene F[2] + # =========================> + # ------------- + # F[8] F[9] + + # gene + # F[1]=========================>F[2] + + # F[8]---------------F[9] + + + if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + tmplist.append(0) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + ## F[1]=============================>F[2] + ## F[8]---------------F[9] + + + if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + overlap = (end_TR-start_TE)+1 + tmplist.append(overlap) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + + # F[1]<======================F[2] + # --------------- + # F[8] F[9] + # + # + # F[1]<=============================F[2] + # F[8]---------------F[9] + + if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + tmplist.append(0) + tmplist.append("TE_overlap_TSS") + linelist.append(tmplist) + tmplist = [] + + # F[1]<=============================F[2] + # F[8]---------------F[9] + + if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): + tmplist.append(line.strip()) + tmplist.append("TE_in_gene") + linelist.append(tmplist) + tmplist = [] + + # F[1] gene F[2] + # ============================== + # ----------- + # F[8] F[9] + + + if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): + for i in range(0,len(line.split("\t"))-1): + tmplist.append(line.split("\t")[i]) + lenTE = (end_TE-start_TE)+1 + tmplist.append(lenTE) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]======================F[2] + # F[8]----------------------------------------------------F[9] + + + if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]==================================>F[2] + # F[8]----------------------------------------------------------F[9] + + if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]<==================================F[2] + # F[8]----------------------------------------------------------F[9] + + if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]==================================>F[2] + # F[8]----------------------------------------------------------F[9] + + if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): + tmplist.append(line.strip()) + tmplist.append("gene_in_TE") + linelist.append(tmplist) + tmplist = [] + + # F[1]<==================================F[2] + # F[8]----------------------------------------------------------F[9] + + favorablecases = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile + w = open(favorablecases,'w') + for s in linelist: + line= "\t".join(str(item) for item in s) + w.write("%s\n" % line) + w.close() + + + def getClassCodeCuffcompare(self, tmap_file, bedtools_file): + class_code_dic = {} + lcode_ref = [] + tmp = [] + linetowrite =[] + with open(tmap_file) as tmap: + tmapline = tmap.readlines() + for i in range(1,len(tmapline)): + cuff_id = tmapline[i].split("\t")[4].strip() + class_code = tmapline[i].split("\t")[2].strip() + ref_id = tmapline[i].split("\t")[1].strip() + lcode_ref.append(class_code) + lcode_ref.append(ref_id) + class_code_dic[cuff_id] = lcode_ref + lcode_ref = [] + + with open(bedtools_file) as bedtools: + bedtoolsline = bedtools.readlines() + for i in xrange(0,len(bedtoolsline)): + tmp = bedtoolsline[i].strip().split("\t") + transcript_bedtools = bedtoolsline[i].split("\t")[3].strip() + if transcript_bedtools in class_code_dic.keys(): + tmp.append(class_code_dic[transcript_bedtools][0]) + tmp.append(class_code_dic[transcript_bedtools][1]) + else: + tmp.append("NA") + tmp.append("NA") + + linetowrite.append(tmp) + tmp=[] + + + output = "%s_with_Ref" % bedtools_file + w = open(output,'w') + line = "" + for i in xrange(0,len(linetowrite)): + for j in range(0,17): + line = line + linetowrite[i][j] + "\t" + w.write(line) + w.write("\n") + line = "" + w.close() + + def run(self): + self.checkOptions() + try: + LoggerFactory.setLevel(self._log, self._verbosity) + exeDir = os.getcwd() + workingDir = "out_TEiso_%s" % time.strftime("%Y%m%d%H%M%S") + if os.path.exists(workingDir): + self._logAndRaise("ERROR: %s already exists." % workingDir) + os.mkdir(workingDir) + referencefile = os.path.abspath(self._reference) + transcriptsfile = os.path.abspath(self._transcripts) + TEfile = os.path.abspath(self._TE) + print "workingDir >>>>> ",workingDir + os.symlink("%s" % os.path.abspath(self._reference), "%s/%s" % (workingDir, os.path.basename(self._reference))) + os.symlink("%s" % os.path.abspath(self._transcripts), "%s/%s" % (workingDir, os.path.basename(self._transcripts))) + os.chdir(workingDir) + bowtie_build_Dir = "bowtie_build" + prefixbowtie = os.path.basename(self._reference).split(".")[0] + iLaunchBowtie = Bowtie_build(referencefile, prefixbowtie, bowtie_build_Dir) + iLaunchBowtie.run() + os.chdir(exeDir) + self._log.info("Indexing genome is finished!!!!") + tophat_Dir = "tophat" + if self._type == "single": + l_single_reads = [] + for reads in range(0, len(self._single_reads)): + os.symlink("%s" % os.path.abspath(self._single_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._single_reads[reads]))) + filename = os.path.splitext(self._single_reads[reads])[0] + filetype = os.path.splitext(self._single_reads[reads])[1] + if filetype == ".gz": + os.system("gunzip -c %s > %s/%s" % (self._single_reads[reads], workingDir, os.path.basename(filename))) + if filetype == ".bz2": + os.system("bunzip2 -c %s > %s/%s" % (os.path.abspath(self._single_reads[reads]), workingDir, os.path.basename(filename))) + if filetype ==".fq": + filename = self._single_reads[reads] + l_single_reads.append("%s" % os.path.basename(filename)) + bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) + path = ("%s/%s") % (exeDir,workingDir) + os.chdir(path) + iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, l_single_reads, self._left_reads, self._right_reads) + iLaunchTophat.run() + if self._type == "paired": + l_left_reads = [] + l_right_reads = [] + for reads in range(0, len(self._left_reads)): + os.symlink("%s" % os.path.abspath(self._left_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._left_reads[reads]))) + filename = os.path.splitext(self._left_reads[reads])[0] + filetype = os.path.splitext(self._left_reads[reads])[1] + ##TODO : check type input file: mimetypes.guess_type(self._single_reads[reads]) + if filetype == ".gz": + os.system("gunzip -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) + if filetype == ".bz2": + os.system("bunzip2 -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) + if filetype ==".fq": + filename = self._left_reads[reads] + l_left_reads.append("%s" % os.path.basename(filename)) + + for reads in range(0, len(self._right_reads)): + os.symlink("%s" % os.path.abspath(self._right_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._right_reads[reads]))) + filename = os.path.splitext(self._right_reads[reads])[0] + filetype = os.path.splitext(self._right_reads[reads])[1] + if filetype == ".gz": + os.system("gunzip -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) + if filetype == ".bz2": + os.system("bunzip2 -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) + if filetype ==".fq": + filename = self._right_reads[reads] + l_right_reads.append("%s" % os.path.basename(filename)) + bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) + path= ("%s/%s") % (exeDir,workingDir) + os.chdir(path) + iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, self._single_reads, l_left_reads, l_right_reads) + iLaunchTophat.run() + self._log.info("Mapping reads is finished!!!!") + + if self._assembly_tool == "cufflinks": + cufflinks_Dir = "cufflinks" + accepted_hits = "%s/accepted_hits.bam" % tophat_Dir + iLaunchCufflinks = Cufflinks(accepted_hits, transcriptsfile , cufflinks_Dir) + iLaunchCufflinks.run() + self._log.info("%s is finished!!!!" % self._assembly_tool) + os.symlink("cufflinks/transcripts.gtf", "transcripts.gtf") + cuffcompare_Dir = "cuffcompare" + transcripts = "transcripts.gtf" + iLaunchCuffcompare = Cuffcompare(transcriptsfile, transcripts, outprefix = "cuffcompare", workingDir = cuffcompare_Dir) + iLaunchCuffcompare.run() + self._log.info("Cuffcompare is finished!!!!") + + + bedtools_closest_Dir = "bedtools_closest" + os.mkdir(bedtools_closest_Dir) + os.chdir(bedtools_closest_Dir) + + transcriptsgtf = "%s_transcripts.gtf" % self._assembly_tool + os.symlink("../%s/transcripts.gtf" % self._assembly_tool,transcriptsgtf) + transcriptsbed = "%s_transcripts.bed" % self._assembly_tool + self.getTranscriptToBed(transcriptsgtf,transcriptsbed) + TEgff = os.path.basename(os.path.splitext(TEfile)[0]) + ".gff3" + TEbed = os.path.basename(os.path.splitext(TEfile)[0]) + ".bed" + os.symlink("%s" % TEfile,TEgff) + self.getTEGFFToBed(TEgff,TEbed) + iLauncherBdc= Bedtools_closest(transcriptsbed, TEbed, "bedtools_closest_%s" % transcriptsgtf.split(".")[0]) + iLauncherBdc.run() + self._log.info("Bedtools closest is finished!!!!") + bedtoolsfile = "bedtools_closest_%s" % transcriptsgtf.split(".")[0] + self.getTEnearPromoter(bedtoolsfile) + tmap_file = "../cuffcompare/cuffcompare.transcripts.gtf.tmap" + bedtools_file = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile + + self.getClassCodeCuffcompare(tmap_file,bedtools_file) + os.chdir("..") + self._log.info("Done!!!!") + + except Exception: + self._logAndRaise("ERROR in TEiso") + + +if __name__ == "__main__": + iLaunch = LaunchTEiso() + iLaunch._setAttributesFromCmdLine() + iLaunch.run()