diff TEisotools-1.1.a/TEiso/ClosestToStartSite.py @ 16:836ce3d9d47a draft default tip

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:42:47 -0400
parents 255c852351c5
children
line wrap: on
line diff
--- a/TEisotools-1.1.a/TEiso/ClosestToStartSite.py	Thu Jul 21 07:36:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,333 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright INRA (Institut National de la Recherche Agronomique)
-# http://www.inra.fr
-# http://urgi.versailles.inra.fr
-#
-# This software is governed by the CeCILL license under French law and
-# abiding by the rules of distribution of free software.  You can  use,
-# modify and/ or redistribute the software under the terms of the CeCILL
-# license as circulated by CEA, CNRS and INRIA at the following URL
-# "http://www.cecill.info".
-#
-# As a counterpart to the access to the source code and  rights to copy,
-# modify and redistribute granted by the license, users are provided only
-# with a limited warranty  and the software's author,  the holder of the
-# economic rights,  and the successive licensors  have only  limited
-# liability.
-#
-# In this respect, the user's attention is drawn to the risks associated
-# with loading,  using,  modifying and/or developing or reproducing the
-# software by the user in light of its specific status of free software,
-# that may mean  that it is complicated to manipulate,  and  that  also
-# therefore means  that it is reserved for developers  and  experienced
-# professionals having in-depth computer knowledge. Users are therefore
-# encouraged to load and test the software's suitability as regards their
-# requirements in conditions enabling the security of their systems and/or
-# data to be ensured and,  more generally, to use and operate it in the
-# same conditions as regards security.
-#
-# The fact that you are presently reading this means that you have had
-# knowledge of the CeCILL license and that you accept its terms.
-
-from commons.core.LoggerFactory             import LoggerFactory
-from commons.core.utils.RepetOptionParser   import RepetOptionParser
-from commons.core.checker.RepetException    import RepetException
-from commons.core.utils.FileUtils           import FileUtils
-import re,os
-LOG_NAME = "TEiso"
-
-class ClosestToStartSite(object):
-    
-    def __init__(self, inputFile = "", cuffcom_tmap =   "", outputFile = "", verbosity = 3):
-        self._inputFile = inputFile
-        self._cuffcom_tmap = cuffcom_tmap
-        self._outputFile = outputFile
-        self._verbosity = verbosity
-        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity)
-            
-    def setAttributesFromCmdLine(self):
-        self._toolVersion = "1.1.a"
-        description = "ClosestToStartSite version %s" % self._toolVersion
-        epilog = "\nParser a bed file and create a bed file to create a report about positions of features A to features B. \n"
-        epilog +="it can also add the class code of features A.\n"
-        epilog += "example: ClosestToStartSite.py -i <inputFile> -c <cuff_in.tmap> -o <outputFile>\n"
-        parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) 
-        parser.add_option("-i", "--inputFile",  dest = "inputFile",  action = "store", type = "string", help = "input bed file name.",  default = "")
-        parser.add_option("-c", "--cuffcom_tmap",  dest = "cuffcom_tmap",  action = "store", type = "string", help = "input gtf file of cuffcompare (<cuff_in>.tmap)",  default = "")
-        parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output file name", default = "")
-        parser.add_option("-v", "--verbosity",  dest = "verbosity",  action = "store", type = "int",    help = "verbosity [optional] [default: 3]",default = 3)
-        options = parser.parse_args()[0]
-        self._setAttributesFromOptions(options)
-
-    def _setAttributesFromOptions(self, options):
-        self._inputFile  = options.inputFile
-        self._cuffcom_tmap = options.cuffcom_tmap
-        self._outputFile = options.outputFile
-        self._verbosity  = options.verbosity
-                    
-    def _logAndRaise(self, errorMsg):
-        self._log.error(errorMsg)
-        raise RepetException(errorMsg)
-
-    def checkoption(self):
-        if self._inputFile == "":
-            self._log.info("Missing input file")
-        else:
-            if not FileUtils.isRessourceExists(self._inputFile):
-                self._log.info("'%s' doesn't exist!" % self._inputFile)	
-
-        if self._cuffcom_tmap != "":
-            if not FileUtils.isRessourceExists(self._cuffcom_tmap):
-                self._log.info("'%s' doesn't exist!" % self._cuffcom_tmap)
-	    if self._outputFile == "":
-		self._outputFile = "%s_Close2TSS_with_classcode.bed" % os.path.splitext(self._inputFile)[0]
-    	    else:
-           	if FileUtils.isRessourceExists(self._outputFile):
-                	self._log.info("Output file '%s' already exists!" % self._outputFile)
-        else:
-           if self._outputFile == "":
-                self._outputFile = "%s_Close2TSS.bed" % os.path.splitext(self._inputFile)[0]
-           else:
-           	if FileUtils.isRessourceExists(self._outputFile):
-                	self._log.info("Output file '%s' already exists!" % self._outputFile)
-		
-            
-    def getClassCodeCuffcompare(self, tmap_file, listPossitions):
-        class_code_dic = {}
-        lcode_ref = []
-        tmp = []
-        linetowrite =[]
-        with open(tmap_file) as tmap:
-                tmapline = tmap.readlines()
-                for i in range(1,len(tmapline)):
-                    cuff_id = tmapline[i].split("\t")[4].strip()
-                    class_code = tmapline[i].split("\t")[2].strip()
-                    ref_id = tmapline[i].split("\t")[1].strip()
-                    lcode_ref.append(class_code)
-                    lcode_ref.append(ref_id)
-                    class_code_dic[cuff_id] = lcode_ref
-                    lcode_ref = []
-        
-        
-        for i in xrange(0,len(listPossitions)):
-            tmp.extend(listPossitions[i])
-            transcript_bedtools = listPossitions[i][3]
-                
-            if transcript_bedtools in class_code_dic.keys():
-                    tmp.append(class_code_dic[transcript_bedtools][0])
-                    tmp.append(class_code_dic[transcript_bedtools][1])
-            else:
-                    tmp.append("NA")
-                    tmp.append("NA")
-            linetowrite.append(tmp)
-            tmp=[]
-        return linetowrite
-        
-
-           
-    def getClosestToStartSite(self, inputFile):
-        linelist = []
-        tmplist = []
-        with open(inputFile, "r") as bedFile:
-            for line in bedFile.readlines():
-                m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line)
-                if(m != None):
-                    start_TR = int(m.group(2))##F[1]
-                    end_TR = int(m.group(3))##F[2]
-                    strand_TR= m.group(6) ##[5]
-                    start_TE = int(m.group(9))##[8]
-                    end_TE = int(m.group(10))##[9]
-                    dist = int(m.group(14))##[13]
-                    if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(line.split("\t")[13].strip())
-                        tmplist.append("TE_closest_TSS")
-                        linelist.append(tmplist)
-                        tmplist = []
-            #                        F[1]   gene             F[2]
-            #                        =========================>
-            #         ------------
-            #        F[8]        F[9]
-    
-                    if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(line.split("\t")[13].strip())
-                        tmplist.append("TE_closest_TSS")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                        F[1]                F[2]
-            #                        <======================
-            #                                                    ---------------
-                  
-                    if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        overlap = (end_TE-start_TR)+1
-                        tmplist.append(overlap)
-                        tmplist.append("TE_overlap_TSS")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                        F[1]   gene             F[2]
-            #                        =========================>
-            #                 -------------
-            #                F[8]        F[9]
-            
-            #                                         gene
-            #                                F[1]=========================>F[2]
-            
-            #                                F[8]---------------F[9]
-            
-            
-                    if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        tmplist.append("TE_overlap_TSS")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            ##                                F[1]=============================>F[2]
-            ##                 F[8]---------------F[9]
-            
-            
-                    if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        overlap = (end_TR-start_TE)+1
-                        tmplist.append(overlap)
-                        tmplist.append("TE_overlap_TSS")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            
-            #                        F[1]<======================F[2]
-            #                                           ---------------
-            #                                        F[8]           F[9]
-            #
-            #
-            #                    F[1]<=============================F[2]
-            #                                   F[8]---------------F[9]
-            
-                    if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        tmplist.append("TE_overlap_TSS")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                                                F[1]<=============================F[2]
-            #                                                                                F[8]---------------F[9]
-            
-                    if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        #tmplist.append(line.strip())
-                        tmplist.append("TE_in_gene")
-                        linelist.append(tmplist)
-                        tmplist = []
-                  
-            
-            #                        F[1]            gene          F[2]
-            #                        ==============================
-            #                                 -----------
-            #                                F[8]        F[9]
-            
-            
-                    if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        #lenTE = (end_TE-start_TE)+1
-                        tmplist.append(0)
-                        tmplist.append("gene_in_TE")
-                        linelist.append(tmplist)
-                        tmplist = []
-                        
-            #                        F[1]======================F[2]
-            #        F[8]----------------------------------------------------F[9]
-            
-            
-                    if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        #tmplist.append(line.strip())
-                        tmplist.append("gene_in_TE")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                                                                F[1]==================================>F[2]
-            #                                        F[8]----------------------------------------------------------F[9]
-            
-                    if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        #tmplist.append(line.strip())
-                        tmplist.append("gene_in_TE")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                                                                F[1]<==================================F[2]
-            #                                        F[8]----------------------------------------------------------F[9]
-            
-                    if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        #tmplist.append(line.strip())
-                        tmplist.append("gene_in_TE")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                                                                F[1]==================================>F[2]
-            #                                                                F[8]----------------------------------------------------------F[9]
-            
-                    if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR):
-                        for i in range(0,len(line.split("\t"))-1):
-                            tmplist.append(line.split("\t")[i])
-                        tmplist.append(0)
-                        #tmplist.append(line.strip())
-                        tmplist.append("gene_in_TE")
-                        linelist.append(tmplist)
-                        tmplist = []
-            
-            #                                                                F[1]<==================================F[2]
-            #                                                                F[8]----------------------------------------------------------F[9]
-        
-        return linelist
-    
-    
-    def writeOutputFromList(self, listPossitions , outputFile):
-        w = open(outputFile,'w')
-        for s in listPossitions:
-            line= "\t".join(str(item) for item in s)
-            w.write("%s\n" % line)
-        w.close()
-
-                
-    def run(self):
-        self.checkoption()
-        listPossitions = self.getClosestToStartSite(self._inputFile)
-        if self._cuffcom_tmap == "":
-            self.writeOutputFromList(listPossitions, self._outputFile )
-        else:
-            listclasscode = self.getClassCodeCuffcompare(self._cuffcom_tmap, listPossitions)
-            self.writeOutputFromList(listclasscode, self._outputFile)
-                
-if __name__== "__main__":
-    iClosestToStartSite = ClosestToStartSite()
-    iClosestToStartSite.setAttributesFromCmdLine()
-    iClosestToStartSite.run()
-
-
-
-
-
-