Mercurial > repos > urgi-team > teiso
diff TEisotools-1.1.a/TEiso/LaunchTEiso.py @ 16:836ce3d9d47a draft default tip
Uploaded
author | urgi-team |
---|---|
date | Thu, 21 Jul 2016 07:42:47 -0400 |
parents | 255c852351c5 |
children |
line wrap: on
line diff
--- a/TEisotools-1.1.a/TEiso/LaunchTEiso.py Thu Jul 21 07:36:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,503 +0,0 @@ -#!/usr/bin/env python - -# Copyright INRA (Institut National de la Recherche Agronomique) -# http://www.inra.fr -# http://urgi.versailles.inra.fr -# -# This software is governed by the CeCILL license under French law and -# abiding by the rules of distribution of free software. You can use, -# modify and/ or redistribute the software under the terms of the CeCILL -# license as circulated by CEA, CNRS and INRIA at the following URL -# "http://www.cecill.info". -# -# As a counterpart to the access to the source code and rights to copy, -# modify and redistribute granted by the license, users are provided only -# with a limited warranty and the software's author, the holder of the -# economic rights, and the successive licensors have only limited -# liability. -# -# In this respect, the user's attention is drawn to the risks associated -# with loading, using, modifying and/or developing or reproducing the -# software by the user in light of its specific status of free software, -# that may mean that it is complicated to manipulate, and that also -# therefore means that it is reserved for developers and experienced -# professionals having in-depth computer knowledge. Users are therefore -# encouraged to load and test the software's suitability as regards their -# requirements in conditions enabling the security of their systems and/or -# data to be ensured and, more generally, to use and operate it in the -# same conditions as regards security. -# -# The fact that you are presently reading this means that you have had -# knowledge of the CeCILL license and that you accept its terms. - - -from commons.core.parsing.GtfParser import GtfParser -from commons.core.parsing.GffParser import GffParser -from TEiso.Bowtie_build import Bowtie_build -from TEiso.Tophat import Tophat -from TEiso.Cufflinks import Cufflinks -from TEiso.Cuffcompare import Cuffcompare -from TEiso.Bedtools_closest import Bedtools_closest -from commons.core.LoggerFactory import LoggerFactory -from commons.core.utils.RepetOptionParser import RepetOptionParser -from commons.core.utils.FileUtils import FileUtils -import os -import time -import re -import sys -LOG_NAME = "repet.TEiso" - -class LaunchTEiso(object): - - def __init__(self, reference="", input_transcripts="", single_paired="", single_reads="", left_reads="", right_reads="", transposable_element = "", assembly_tool="", verbosity=3): - self._reference = reference - self._transcripts = input_transcripts - self._type = single_paired - self._single_reads = single_reads.split(",") - self._left_reads = left_reads.split(",") - self._right_reads = right_reads.split(",") - self._TE = transposable_element - self._assembly_tool = assembly_tool - self._verbosity = verbosity - self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) - - def _setAttributesFromCmdLine(self): - self._toolVersion = "0.1" - description = "TEiso version %s" % self._toolVersion - epilog = "\n if reads are single:\n LaunchTEiso.py -f <genome.fa> -g <transcripts.gtf> -e single -s <single_read> -t <TE.gff> -a cufflinks \n" - epilog += " if reads are paired:\n LaunchTEiso.py -f <genome.fa> -g <transcripts.gtf> -e paired -l <reads_left> -r <reads_right> -t <TE.gff> -a cufflinks \n" - parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) - parser.add_option('-f' , '--input_reference' , dest='input_reference' , help='file with ref sequences') - parser.add_option('-g' , '--input_transcripts', dest='input_transcripts', help='GTF/GFF with known transcripts' , default="") - parser.add_option('-e' , '--single_paired' , dest='single_paired' , help='type of input reads, single or paired end', default="paired") - parser.add_option('-s' , '--single_read' , dest='single_read' , help='a single input read' , default="") - parser.add_option('-l', '--left_read' , dest='left_read' , help='left reads' , default="") - parser.add_option('-r', '--right_read' , dest='right_read' , help='right reads' , default="") - parser.add_option('-t' , '--input_transposable_element', dest='input_transposable_element', help='GFF with known transposable_element' , default="") - parser.add_option('-a' , '--assembly_tool' , dest='assembly_tool' , help='type of RNA-Seq assembly tool' , default="cufflinks") - options = parser.parse_args()[0] - self.setAttributesFromOptions(options) - - def setAttributesFromOptions(self, options): - self._reference = options.input_reference - self._transcripts = options.input_transcripts - self._type = options.single_paired - self._single_reads = options.single_read.split(",") - self._left_reads = options.left_read.split(",") - self._right_reads = options.right_read.split(",") - self._TE = options.input_transposable_element - self._assembly_tool = options.assembly_tool - - def _logAndRaise(self, errorMsg): - self._log.error(errorMsg) - #raise Exception(errorMsg) - sys.exit(1) - - def checkOptions(self): - if self._type == "paired": - if self._single_reads != ['']: - self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") - if self._left_reads == ['']: - self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") - if self._right_reads == ['']: - self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") - if self._right_reads == self._left_reads: - self._logAndRaise("ERROR: -l and -r options are same!") - - if self._type == "single": - if self._left_reads != ['']: - self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") - if self._right_reads != ['']: - self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") - if self._single_reads == ['']: - self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") - - if self._reference != "": - if not FileUtils.isRessourceExists(self._reference): - self._logAndRaise("ERROR: reference file %s does not exist!" % self._reference) - else: - self._logAndRaise("ERROR: No specified -f option!") - - if self._transcripts != "": - if not FileUtils.isRessourceExists(self._transcripts): - self._logAndRaise("ERROR: transcripts file %s does not exist!" % self._transcripts) - else: - self._logAndRaise("ERROR: No specified -g option!") - - if self._TE != "": - if not FileUtils.isRessourceExists(self._TE): - self._logAndRaise("ERROR: transposable elements %s does not exist!" % self._TE) - else: - self._logAndRaise("ERROR: No specified -t option!") - - - - def getTranscriptToBed(self, inputFile,outputFile): - try: - filewrite=open(outputFile, "w") - gtfParser = GtfParser(inputFile, assemblyTools=True) - for transcript in gtfParser.getIterator(): - if(transcript.getDirection()==1): - strand="+" - else: - strand="-" - filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n" % (transcript.getChromosome(),transcript.getStart(), - transcript.getEnd(), transcript.getTagValue("ID"), transcript.getTagValue("gene_id"), - strand,float(transcript.getTagValue("FPKM")) )) - - filewrite.close() - except: - self._logAndRaise("Couldn't open %s for writing" % outputFile) - - - def getTEGFFToBed(self, inputFile, outputFile): - """TODO Dont write bed line when the strand is '.' - See Gtf parser option assemblyTools - """ - try: - filewrite=open(outputFile, "w") - gffParser = GffParser(inputFile) - for transcript in gffParser.getIterator(): - if(transcript.getDirection()==1): - strand="+" - else: - strand="-" - filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript.getChromosome(),transcript.getStart(), - transcript.getEnd(), transcript.getTagValue("ID").split("_")[0]+"_", transcript.getTagValue("Target").split("_")[0], strand) ) - - filewrite.close() - except: - self._logAndRaise("Couldn't open %s for writing" % outputFile) - - - def getTEnearPromoter (self, bedtoolsfile): - #### BEdParser.py in commons is not used because the format of this bed file is different. -# #Chrom starttr endtr transcript_id gene_ID strand fpkm chromte startte endte idte targetTE strandTE distance - #scaffold_1 37570 37785 GSSPFG00034586001-RA GSSPFG00034586001 + 0.0000000000 scaffold_1 33914 40164 ms162_ PotentialHostGene - 0 - - linelist = [] - tmplist = [] - with open(bedtoolsfile, "r") as bedFile: - for line in bedFile.readlines(): - m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) - if(m != None): - start_TR = int(m.group(2))##F[1] - end_TR = int(m.group(3))##F[2] - strand_TR= m.group(6) ##[5] - start_TE = int(m.group(9))##[8] - end_TE = int(m.group(10))##[9] - dist = int(m.group(14))##[13] - if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): - tmplist.append(line.strip()) - tmplist.append("TE_closest_TSS") - linelist.append(tmplist) - tmplist = [] - # F[1] gene F[2] - # =========================> - # ------------ - # F[8] F[9] - - if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): - tmplist.append(line.strip()) - tmplist.append("TE_closest_TSS") - linelist.append(tmplist) - tmplist = [] - - # F[1] F[2] - # <====================== - # --------------- - - if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): - for i in range(0,len(line.split("\t"))-1): - tmplist.append(line.split("\t")[i]) - overlap = (end_TE-start_TR)+1 - tmplist.append(overlap) - tmplist.append("TE_overlap_TSS") - linelist.append(tmplist) - tmplist = [] - - # F[1] gene F[2] - # =========================> - # ------------- - # F[8] F[9] - - # gene - # F[1]=========================>F[2] - - # F[8]---------------F[9] - - - if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): - for i in range(0,len(line.split("\t"))-1): - tmplist.append(line.split("\t")[i]) - tmplist.append(0) - tmplist.append("TE_overlap_TSS") - linelist.append(tmplist) - tmplist = [] - - ## F[1]=============================>F[2] - ## F[8]---------------F[9] - - - if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): - for i in range(0,len(line.split("\t"))-1): - tmplist.append(line.split("\t")[i]) - overlap = (end_TR-start_TE)+1 - tmplist.append(overlap) - tmplist.append("TE_overlap_TSS") - linelist.append(tmplist) - tmplist = [] - - - # F[1]<======================F[2] - # --------------- - # F[8] F[9] - # - # - # F[1]<=============================F[2] - # F[8]---------------F[9] - - if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): - for i in range(0,len(line.split("\t"))-1): - tmplist.append(line.split("\t")[i]) - tmplist.append(0) - tmplist.append("TE_overlap_TSS") - linelist.append(tmplist) - tmplist = [] - - # F[1]<=============================F[2] - # F[8]---------------F[9] - - if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): - tmplist.append(line.strip()) - tmplist.append("TE_in_gene") - linelist.append(tmplist) - tmplist = [] - - # F[1] gene F[2] - # ============================== - # ----------- - # F[8] F[9] - - - if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): - for i in range(0,len(line.split("\t"))-1): - tmplist.append(line.split("\t")[i]) - lenTE = (end_TE-start_TE)+1 - tmplist.append(lenTE) - tmplist.append("gene_in_TE") - linelist.append(tmplist) - tmplist = [] - - # F[1]======================F[2] - # F[8]----------------------------------------------------F[9] - - - if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): - tmplist.append(line.strip()) - tmplist.append("gene_in_TE") - linelist.append(tmplist) - tmplist = [] - - # F[1]==================================>F[2] - # F[8]----------------------------------------------------------F[9] - - if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): - tmplist.append(line.strip()) - tmplist.append("gene_in_TE") - linelist.append(tmplist) - tmplist = [] - - # F[1]<==================================F[2] - # F[8]----------------------------------------------------------F[9] - - if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): - tmplist.append(line.strip()) - tmplist.append("gene_in_TE") - linelist.append(tmplist) - tmplist = [] - - # F[1]==================================>F[2] - # F[8]----------------------------------------------------------F[9] - - if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): - tmplist.append(line.strip()) - tmplist.append("gene_in_TE") - linelist.append(tmplist) - tmplist = [] - - # F[1]<==================================F[2] - # F[8]----------------------------------------------------------F[9] - - favorablecases = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile - w = open(favorablecases,'w') - for s in linelist: - line= "\t".join(str(item) for item in s) - w.write("%s\n" % line) - w.close() - - - def getClassCodeCuffcompare(self, tmap_file, bedtools_file): - class_code_dic = {} - lcode_ref = [] - tmp = [] - linetowrite =[] - with open(tmap_file) as tmap: - tmapline = tmap.readlines() - for i in range(1,len(tmapline)): - cuff_id = tmapline[i].split("\t")[4].strip() - class_code = tmapline[i].split("\t")[2].strip() - ref_id = tmapline[i].split("\t")[1].strip() - lcode_ref.append(class_code) - lcode_ref.append(ref_id) - class_code_dic[cuff_id] = lcode_ref - lcode_ref = [] - - with open(bedtools_file) as bedtools: - bedtoolsline = bedtools.readlines() - for i in xrange(0,len(bedtoolsline)): - tmp = bedtoolsline[i].strip().split("\t") - transcript_bedtools = bedtoolsline[i].split("\t")[3].strip() - if transcript_bedtools in class_code_dic.keys(): - tmp.append(class_code_dic[transcript_bedtools][0]) - tmp.append(class_code_dic[transcript_bedtools][1]) - else: - tmp.append("NA") - tmp.append("NA") - - linetowrite.append(tmp) - tmp=[] - - - output = "%s_with_Ref" % bedtools_file - w = open(output,'w') - line = "" - for i in xrange(0,len(linetowrite)): - for j in range(0,17): - line = line + linetowrite[i][j] + "\t" - w.write(line) - w.write("\n") - line = "" - w.close() - - def run(self): - self.checkOptions() - try: - LoggerFactory.setLevel(self._log, self._verbosity) - exeDir = os.getcwd() - workingDir = "out_TEiso_%s" % time.strftime("%Y%m%d%H%M%S") - if os.path.exists(workingDir): - self._logAndRaise("ERROR: %s already exists." % workingDir) - os.mkdir(workingDir) - referencefile = os.path.abspath(self._reference) - transcriptsfile = os.path.abspath(self._transcripts) - TEfile = os.path.abspath(self._TE) - print "workingDir >>>>> ",workingDir - os.symlink("%s" % os.path.abspath(self._reference), "%s/%s" % (workingDir, os.path.basename(self._reference))) - os.symlink("%s" % os.path.abspath(self._transcripts), "%s/%s" % (workingDir, os.path.basename(self._transcripts))) - os.chdir(workingDir) - bowtie_build_Dir = "bowtie_build" - prefixbowtie = os.path.basename(self._reference).split(".")[0] - iLaunchBowtie = Bowtie_build(referencefile, prefixbowtie, bowtie_build_Dir) - iLaunchBowtie.run() - os.chdir(exeDir) - self._log.info("Indexing genome is finished!!!!") - tophat_Dir = "tophat" - if self._type == "single": - l_single_reads = [] - for reads in range(0, len(self._single_reads)): - os.symlink("%s" % os.path.abspath(self._single_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._single_reads[reads]))) - filename = os.path.splitext(self._single_reads[reads])[0] - filetype = os.path.splitext(self._single_reads[reads])[1] - if filetype == ".gz": - os.system("gunzip -c %s > %s/%s" % (self._single_reads[reads], workingDir, os.path.basename(filename))) - if filetype == ".bz2": - os.system("bunzip2 -c %s > %s/%s" % (os.path.abspath(self._single_reads[reads]), workingDir, os.path.basename(filename))) - if filetype ==".fq": - filename = self._single_reads[reads] - l_single_reads.append("%s" % os.path.basename(filename)) - bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) - path = ("%s/%s") % (exeDir,workingDir) - os.chdir(path) - iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, l_single_reads, self._left_reads, self._right_reads) - iLaunchTophat.run() - if self._type == "paired": - l_left_reads = [] - l_right_reads = [] - for reads in range(0, len(self._left_reads)): - os.symlink("%s" % os.path.abspath(self._left_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._left_reads[reads]))) - filename = os.path.splitext(self._left_reads[reads])[0] - filetype = os.path.splitext(self._left_reads[reads])[1] - ##TODO : check type input file: mimetypes.guess_type(self._single_reads[reads]) - if filetype == ".gz": - os.system("gunzip -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) - if filetype == ".bz2": - os.system("bunzip2 -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) - if filetype ==".fq": - filename = self._left_reads[reads] - l_left_reads.append("%s" % os.path.basename(filename)) - - for reads in range(0, len(self._right_reads)): - os.symlink("%s" % os.path.abspath(self._right_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._right_reads[reads]))) - filename = os.path.splitext(self._right_reads[reads])[0] - filetype = os.path.splitext(self._right_reads[reads])[1] - if filetype == ".gz": - os.system("gunzip -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) - if filetype == ".bz2": - os.system("bunzip2 -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) - if filetype ==".fq": - filename = self._right_reads[reads] - l_right_reads.append("%s" % os.path.basename(filename)) - bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) - path= ("%s/%s") % (exeDir,workingDir) - os.chdir(path) - iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, self._single_reads, l_left_reads, l_right_reads) - iLaunchTophat.run() - self._log.info("Mapping reads is finished!!!!") - - if self._assembly_tool == "cufflinks": - cufflinks_Dir = "cufflinks" - accepted_hits = "%s/accepted_hits.bam" % tophat_Dir - iLaunchCufflinks = Cufflinks(accepted_hits, transcriptsfile , cufflinks_Dir) - iLaunchCufflinks.run() - self._log.info("%s is finished!!!!" % self._assembly_tool) - os.symlink("cufflinks/transcripts.gtf", "transcripts.gtf") - cuffcompare_Dir = "cuffcompare" - transcripts = "transcripts.gtf" - iLaunchCuffcompare = Cuffcompare(transcriptsfile, transcripts, outprefix = "cuffcompare", workingDir = cuffcompare_Dir) - iLaunchCuffcompare.run() - self._log.info("Cuffcompare is finished!!!!") - - - bedtools_closest_Dir = "bedtools_closest" - os.mkdir(bedtools_closest_Dir) - os.chdir(bedtools_closest_Dir) - - transcriptsgtf = "%s_transcripts.gtf" % self._assembly_tool - os.symlink("../%s/transcripts.gtf" % self._assembly_tool,transcriptsgtf) - transcriptsbed = "%s_transcripts.bed" % self._assembly_tool - self.getTranscriptToBed(transcriptsgtf,transcriptsbed) - TEgff = os.path.basename(os.path.splitext(TEfile)[0]) + ".gff3" - TEbed = os.path.basename(os.path.splitext(TEfile)[0]) + ".bed" - os.symlink("%s" % TEfile,TEgff) - self.getTEGFFToBed(TEgff,TEbed) - iLauncherBdc= Bedtools_closest(transcriptsbed, TEbed, "bedtools_closest_%s" % transcriptsgtf.split(".")[0]) - iLauncherBdc.run() - self._log.info("Bedtools closest is finished!!!!") - bedtoolsfile = "bedtools_closest_%s" % transcriptsgtf.split(".")[0] - self.getTEnearPromoter(bedtoolsfile) - tmap_file = "../cuffcompare/cuffcompare.transcripts.gtf.tmap" - bedtools_file = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile - - self.getClassCodeCuffcompare(tmap_file,bedtools_file) - os.chdir("..") - self._log.info("Done!!!!") - - except Exception: - self._logAndRaise("ERROR in TEiso") - - -if __name__ == "__main__": - iLaunch = LaunchTEiso() - iLaunch._setAttributesFromCmdLine() - iLaunch.run()