Mercurial > repos > urgi-team > teiso
view TEisotools-1.0/TEiso/LaunchTEiso.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python # Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. from commons.core.parsing.GtfParser import GtfParser from commons.core.parsing.GffParser import GffParser from TEiso.Bowtie_build import Bowtie_build from TEiso.Tophat import Tophat from TEiso.Cufflinks import Cufflinks from TEiso.Cuffcompare import Cuffcompare from TEiso.Bedtools_closest import Bedtools_closest from commons.core.LoggerFactory import LoggerFactory from commons.core.utils.RepetOptionParser import RepetOptionParser from commons.core.utils.FileUtils import FileUtils import os import time import re import sys LOG_NAME = "repet.TEiso" class LaunchTEiso(object): def __init__(self, reference="", input_transcripts="", single_paired="", single_reads="", left_reads="", right_reads="", transposable_element = "", assembly_tool="", verbosity=3): self._reference = reference self._transcripts = input_transcripts self._type = single_paired self._single_reads = single_reads.split(",") self._left_reads = left_reads.split(",") self._right_reads = right_reads.split(",") self._TE = transposable_element self._assembly_tool = assembly_tool self._verbosity = verbosity self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) def _setAttributesFromCmdLine(self): self._toolVersion = "0.1" description = "TEiso version %s" % self._toolVersion epilog = "\n if reads are single:\n LaunchTEiso.py -f <genome.fa> -g <transcripts.gtf> -e single -s <single_read> -t <TE.gff> -a cufflinks \n" epilog += " if reads are paired:\n LaunchTEiso.py -f <genome.fa> -g <transcripts.gtf> -e paired -l <reads_left> -r <reads_right> -t <TE.gff> -a cufflinks \n" parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) parser.add_option('-f' , '--input_reference' , dest='input_reference' , help='file with ref sequences') parser.add_option('-g' , '--input_transcripts', dest='input_transcripts', help='GTF/GFF with known transcripts' , default="") parser.add_option('-e' , '--single_paired' , dest='single_paired' , help='type of input reads, single or paired end', default="paired") parser.add_option('-s' , '--single_read' , dest='single_read' , help='a single input read' , default="") parser.add_option('-l', '--left_read' , dest='left_read' , help='left reads' , default="") parser.add_option('-r', '--right_read' , dest='right_read' , help='right reads' , default="") parser.add_option('-t' , '--input_transposable_element', dest='input_transposable_element', help='GFF with known transposable_element' , default="") parser.add_option('-a' , '--assembly_tool' , dest='assembly_tool' , help='type of RNA-Seq assembly tool' , default="cufflinks") options = parser.parse_args()[0] self.setAttributesFromOptions(options) def setAttributesFromOptions(self, options): self._reference = options.input_reference self._transcripts = options.input_transcripts self._type = options.single_paired self._single_reads = options.single_read.split(",") self._left_reads = options.left_read.split(",") self._right_reads = options.right_read.split(",") self._TE = options.input_transposable_element self._assembly_tool = options.assembly_tool def _logAndRaise(self, errorMsg): self._log.error(errorMsg) #raise Exception(errorMsg) sys.exit(1) def checkOptions(self): if self._type == "paired": if self._single_reads != ['']: self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") if self._left_reads == ['']: self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") if self._right_reads == ['']: self._logAndRaise("ERROR: for paired reads, you shoud use option left and right reads!") if self._right_reads == self._left_reads: self._logAndRaise("ERROR: -l and -r options are same!") if self._type == "single": if self._left_reads != ['']: self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") if self._right_reads != ['']: self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") if self._single_reads == ['']: self._logAndRaise("ERROR: for single reads, you shoud use option single reads!") if self._reference != "": if not FileUtils.isRessourceExists(self._reference): self._logAndRaise("ERROR: reference file %s does not exist!" % self._reference) else: self._logAndRaise("ERROR: No specified -f option!") if self._transcripts != "": if not FileUtils.isRessourceExists(self._transcripts): self._logAndRaise("ERROR: transcripts file %s does not exist!" % self._transcripts) else: self._logAndRaise("ERROR: No specified -g option!") if self._TE != "": if not FileUtils.isRessourceExists(self._TE): self._logAndRaise("ERROR: transposable elements %s does not exist!" % self._TE) else: self._logAndRaise("ERROR: No specified -t option!") def getTranscriptToBed(self, inputFile,outputFile): try: filewrite=open(outputFile, "w") gtfParser = GtfParser(inputFile, assemblyTools=True) for transcript in gtfParser.getIterator(): if(transcript.getDirection()==1): strand="+" else: strand="-" filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n" % (transcript.getChromosome(),transcript.getStart(), transcript.getEnd(), transcript.getTagValue("ID"), transcript.getTagValue("gene_id"), strand,float(transcript.getTagValue("FPKM")) )) filewrite.close() except: self._logAndRaise("Couldn't open %s for writing" % outputFile) def getTEGFFToBed(self, inputFile, outputFile): """TODO Dont write bed line when the strand is '.' See Gtf parser option assemblyTools """ try: filewrite=open(outputFile, "w") gffParser = GffParser(inputFile) for transcript in gffParser.getIterator(): if(transcript.getDirection()==1): strand="+" else: strand="-" filewrite.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript.getChromosome(),transcript.getStart(), transcript.getEnd(), transcript.getTagValue("ID").split("_")[0]+"_", transcript.getTagValue("Target").split("_")[0], strand) ) filewrite.close() except: self._logAndRaise("Couldn't open %s for writing" % outputFile) def getTEnearPromoter (self, bedtoolsfile): #### BEdParser.py in commons is not used because the format of this bed file is different. # #Chrom starttr endtr transcript_id gene_ID strand fpkm chromte startte endte idte targetTE strandTE distance #scaffold_1 37570 37785 GSSPFG00034586001-RA GSSPFG00034586001 + 0.0000000000 scaffold_1 33914 40164 ms162_ PotentialHostGene - 0 linelist = [] tmplist = [] with open(bedtoolsfile, "r") as bedFile: for line in bedFile.readlines(): m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) if(m != None): start_TR = int(m.group(2))##F[1] end_TR = int(m.group(3))##F[2] strand_TR= m.group(6) ##[5] start_TE = int(m.group(9))##[8] end_TE = int(m.group(10))##[9] dist = int(m.group(14))##[13] if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): tmplist.append(line.strip()) tmplist.append("TE_closest_TSS") linelist.append(tmplist) tmplist = [] # F[1] gene F[2] # =========================> # ------------ # F[8] F[9] if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): tmplist.append(line.strip()) tmplist.append("TE_closest_TSS") linelist.append(tmplist) tmplist = [] # F[1] F[2] # <====================== # --------------- if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) overlap = (end_TE-start_TR)+1 tmplist.append(overlap) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] # F[1] gene F[2] # =========================> # ------------- # F[8] F[9] # gene # F[1]=========================>F[2] # F[8]---------------F[9] if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] ## F[1]=============================>F[2] ## F[8]---------------F[9] if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) overlap = (end_TR-start_TE)+1 tmplist.append(overlap) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] # F[1]<======================F[2] # --------------- # F[8] F[9] # # # F[1]<=============================F[2] # F[8]---------------F[9] if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] # F[1]<=============================F[2] # F[8]---------------F[9] if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): tmplist.append(line.strip()) tmplist.append("TE_in_gene") linelist.append(tmplist) tmplist = [] # F[1] gene F[2] # ============================== # ----------- # F[8] F[9] if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) lenTE = (end_TE-start_TE)+1 tmplist.append(lenTE) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]======================F[2] # F[8]----------------------------------------------------F[9] if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]==================================>F[2] # F[8]----------------------------------------------------------F[9] if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]<==================================F[2] # F[8]----------------------------------------------------------F[9] if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]==================================>F[2] # F[8]----------------------------------------------------------F[9] if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]<==================================F[2] # F[8]----------------------------------------------------------F[9] favorablecases = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile w = open(favorablecases,'w') for s in linelist: line= "\t".join(str(item) for item in s) w.write("%s\n" % line) w.close() def getClassCodeCuffcompare(self, tmap_file, bedtools_file): class_code_dic = {} lcode_ref = [] tmp = [] linetowrite =[] with open(tmap_file) as tmap: tmapline = tmap.readlines() for i in range(1,len(tmapline)): cuff_id = tmapline[i].split("\t")[4].strip() class_code = tmapline[i].split("\t")[2].strip() ref_id = tmapline[i].split("\t")[1].strip() lcode_ref.append(class_code) lcode_ref.append(ref_id) class_code_dic[cuff_id] = lcode_ref lcode_ref = [] with open(bedtools_file) as bedtools: bedtoolsline = bedtools.readlines() for i in xrange(0,len(bedtoolsline)): tmp = bedtoolsline[i].strip().split("\t") transcript_bedtools = bedtoolsline[i].split("\t")[3].strip() if transcript_bedtools in class_code_dic.keys(): tmp.append(class_code_dic[transcript_bedtools][0]) tmp.append(class_code_dic[transcript_bedtools][1]) else: tmp.append("NA") tmp.append("NA") linetowrite.append(tmp) tmp=[] output = "%s_with_Ref" % bedtools_file w = open(output,'w') line = "" for i in xrange(0,len(linetowrite)): for j in range(0,17): line = line + linetowrite[i][j] + "\t" w.write(line) w.write("\n") line = "" w.close() def run(self): self.checkOptions() try: LoggerFactory.setLevel(self._log, self._verbosity) exeDir = os.getcwd() workingDir = "out_TEiso_%s" % time.strftime("%Y%m%d%H%M%S") if os.path.exists(workingDir): self._logAndRaise("ERROR: %s already exists." % workingDir) os.mkdir(workingDir) referencefile = os.path.abspath(self._reference) transcriptsfile = os.path.abspath(self._transcripts) TEfile = os.path.abspath(self._TE) print "workingDir >>>>> ",workingDir os.symlink("%s" % os.path.abspath(self._reference), "%s/%s" % (workingDir, os.path.basename(self._reference))) os.symlink("%s" % os.path.abspath(self._transcripts), "%s/%s" % (workingDir, os.path.basename(self._transcripts))) os.chdir(workingDir) bowtie_build_Dir = "bowtie_build" prefixbowtie = os.path.basename(self._reference).split(".")[0] iLaunchBowtie = Bowtie_build(referencefile, prefixbowtie, bowtie_build_Dir) iLaunchBowtie.run() os.chdir(exeDir) self._log.info("Indexing genome is finished!!!!") tophat_Dir = "tophat" if self._type == "single": l_single_reads = [] for reads in range(0, len(self._single_reads)): os.symlink("%s" % os.path.abspath(self._single_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._single_reads[reads]))) filename = os.path.splitext(self._single_reads[reads])[0] filetype = os.path.splitext(self._single_reads[reads])[1] if filetype == ".gz": os.system("gunzip -c %s > %s/%s" % (self._single_reads[reads], workingDir, os.path.basename(filename))) if filetype == ".bz2": os.system("bunzip2 -c %s > %s/%s" % (os.path.abspath(self._single_reads[reads]), workingDir, os.path.basename(filename))) if filetype ==".fq": filename = self._single_reads[reads] l_single_reads.append("%s" % os.path.basename(filename)) bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) path = ("%s/%s") % (exeDir,workingDir) os.chdir(path) iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, l_single_reads, self._left_reads, self._right_reads) iLaunchTophat.run() if self._type == "paired": l_left_reads = [] l_right_reads = [] for reads in range(0, len(self._left_reads)): os.symlink("%s" % os.path.abspath(self._left_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._left_reads[reads]))) filename = os.path.splitext(self._left_reads[reads])[0] filetype = os.path.splitext(self._left_reads[reads])[1] ##TODO : check type input file: mimetypes.guess_type(self._single_reads[reads]) if filetype == ".gz": os.system("gunzip -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) if filetype == ".bz2": os.system("bunzip2 -c %s > %s/%s" % (self._left_reads[reads],workingDir, os.path.basename(filename))) if filetype ==".fq": filename = self._left_reads[reads] l_left_reads.append("%s" % os.path.basename(filename)) for reads in range(0, len(self._right_reads)): os.symlink("%s" % os.path.abspath(self._right_reads[reads]), "%s/%s" % (workingDir, os.path.basename(self._right_reads[reads]))) filename = os.path.splitext(self._right_reads[reads])[0] filetype = os.path.splitext(self._right_reads[reads])[1] if filetype == ".gz": os.system("gunzip -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) if filetype == ".bz2": os.system("bunzip2 -c %s > %s/%s" % (self._right_reads[reads],workingDir, os.path.basename(filename))) if filetype ==".fq": filename = self._right_reads[reads] l_right_reads.append("%s" % os.path.basename(filename)) bowtiePrefix = "%s/%s" % (bowtie_build_Dir, prefixbowtie) path= ("%s/%s") % (exeDir,workingDir) os.chdir(path) iLaunchTophat = Tophat(tophat_Dir, bowtiePrefix, self._type, self._single_reads, l_left_reads, l_right_reads) iLaunchTophat.run() self._log.info("Mapping reads is finished!!!!") if self._assembly_tool == "cufflinks": cufflinks_Dir = "cufflinks" accepted_hits = "%s/accepted_hits.bam" % tophat_Dir iLaunchCufflinks = Cufflinks(accepted_hits, transcriptsfile , cufflinks_Dir) iLaunchCufflinks.run() self._log.info("%s is finished!!!!" % self._assembly_tool) os.symlink("cufflinks/transcripts.gtf", "transcripts.gtf") cuffcompare_Dir = "cuffcompare" transcripts = "transcripts.gtf" iLaunchCuffcompare = Cuffcompare(transcriptsfile, transcripts, outprefix = "cuffcompare", workingDir = cuffcompare_Dir) iLaunchCuffcompare.run() self._log.info("Cuffcompare is finished!!!!") bedtools_closest_Dir = "bedtools_closest" os.mkdir(bedtools_closest_Dir) os.chdir(bedtools_closest_Dir) transcriptsgtf = "%s_transcripts.gtf" % self._assembly_tool os.symlink("../%s/transcripts.gtf" % self._assembly_tool,transcriptsgtf) transcriptsbed = "%s_transcripts.bed" % self._assembly_tool self.getTranscriptToBed(transcriptsgtf,transcriptsbed) TEgff = os.path.basename(os.path.splitext(TEfile)[0]) + ".gff3" TEbed = os.path.basename(os.path.splitext(TEfile)[0]) + ".bed" os.symlink("%s" % TEfile,TEgff) self.getTEGFFToBed(TEgff,TEbed) iLauncherBdc= Bedtools_closest(transcriptsbed, TEbed, "bedtools_closest_%s" % transcriptsgtf.split(".")[0]) iLauncherBdc.run() self._log.info("Bedtools closest is finished!!!!") bedtoolsfile = "bedtools_closest_%s" % transcriptsgtf.split(".")[0] self.getTEnearPromoter(bedtoolsfile) tmap_file = "../cuffcompare/cuffcompare.transcripts.gtf.tmap" bedtools_file = "%s_TSSoverlaps_and_TE_closest_TSS_and_inclus_ALL" % bedtoolsfile self.getClassCodeCuffcompare(tmap_file,bedtools_file) os.chdir("..") self._log.info("Done!!!!") except Exception: self._logAndRaise("ERROR in TEiso") if __name__ == "__main__": iLaunch = LaunchTEiso() iLaunch._setAttributesFromCmdLine() iLaunch.run()