Mercurial > repos > urgi-team > teiso
view TEisotools-1.0/TEiso/ClosestToStartSite.py @ 10:b6db3eaf35a5 draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 08:07:31 -0400 |
parents | 20ec0d14798e |
children |
line wrap: on
line source
#!/usr/bin/env python # Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. from commons.core.LoggerFactory import LoggerFactory from commons.core.utils.RepetOptionParser import RepetOptionParser from commons.core.checker.RepetException import RepetException from commons.core.utils.FileUtils import FileUtils import re,os LOG_NAME = "TEiso" class ClosestToStartSite(object): def __init__(self, inputFile = "", cuffcom_tmap = "", outputFile = "", verbosity = 3): self._inputFile = inputFile self._cuffcom_tmap = cuffcom_tmap self._outputFile = outputFile self._verbosity = verbosity self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) def setAttributesFromCmdLine(self): self._toolVersion = "1.0" description = "ClosestToStartSite version %s" % self._toolVersion epilog = "\nParser a bed file and create a bed file to create a report about positions of features A to features B. \n" epilog +="it can also add the class code of features A.\n" epilog += "example: ClosestToStartSite.py -i <inputFile> -c <cuff_in.tmap> -o <outputFile>\n" parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "input bed file name.", default = "") parser.add_option("-c", "--cuffcom_tmap", dest = "cuffcom_tmap", action = "store", type = "string", help = "input gtf file of cuffcompare (<cuff_in>.tmap)", default = "") parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output file name", default = "") parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 3]",default = 3) options = parser.parse_args()[0] self._setAttributesFromOptions(options) def _setAttributesFromOptions(self, options): self._inputFile = options.inputFile self._cuffcom_tmap = options.cuffcom_tmap self._outputFile = options.outputFile self._verbosity = options.verbosity def _logAndRaise(self, errorMsg): self._log.error(errorMsg) raise RepetException(errorMsg) def checkoption(self): if self._inputFile == "": self._log.info("Missing input file") else: if not FileUtils.isRessourceExists(self._inputFile): self._log.info("'%s' doesn't exist!" % self._inputFile) if self._cuffcom_tmap != "": if not FileUtils.isRessourceExists(self._cuffcom_tmap): self._log.info("'%s' doesn't exist!" % self._cuffcom_tmap) if self._outputFile == "": self._outputFile = "%s_Close2TSS_with_classcode.bed" % os.path.splitext(self._inputFile)[0] else: if FileUtils.isRessourceExists(self._outputFile): self._log.info("Output file '%s' already exists!" % self._outputFile) else: if self._outputFile == "": self._outputFile = "%s_Close2TSS.bed" % os.path.splitext(self._inputFile)[0] else: if FileUtils.isRessourceExists(self._outputFile): self._log.info("Output file '%s' already exists!" % self._outputFile) def getClassCodeCuffcompare(self, tmap_file, listPossitions): class_code_dic = {} lcode_ref = [] tmp = [] linetowrite =[] with open(tmap_file) as tmap: tmapline = tmap.readlines() for i in range(1,len(tmapline)): cuff_id = tmapline[i].split("\t")[4].strip() class_code = tmapline[i].split("\t")[2].strip() ref_id = tmapline[i].split("\t")[1].strip() lcode_ref.append(class_code) lcode_ref.append(ref_id) class_code_dic[cuff_id] = lcode_ref lcode_ref = [] for i in xrange(0,len(listPossitions)): tmp.extend(listPossitions[i]) transcript_bedtools = listPossitions[i][3] if transcript_bedtools in class_code_dic.keys(): tmp.append(class_code_dic[transcript_bedtools][0]) tmp.append(class_code_dic[transcript_bedtools][1]) else: tmp.append("NA") tmp.append("NA") linetowrite.append(tmp) tmp=[] return linetowrite def getClosestToStartSite(self, inputFile): linelist = [] tmplist = [] with open(inputFile, "r") as bedFile: for line in bedFile.readlines(): m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) if(m != None): start_TR = int(m.group(2))##F[1] end_TR = int(m.group(3))##F[2] strand_TR= m.group(6) ##[5] start_TE = int(m.group(9))##[8] end_TE = int(m.group(10))##[9] dist = int(m.group(14))##[13] if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(line.split("\t")[13].strip()) tmplist.append("TE_closest_TSS") linelist.append(tmplist) tmplist = [] # F[1] gene F[2] # =========================> # ------------ # F[8] F[9] if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(line.split("\t")[13].strip()) tmplist.append("TE_closest_TSS") linelist.append(tmplist) tmplist = [] # F[1] F[2] # <====================== # --------------- if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) overlap = (end_TE-start_TR)+1 tmplist.append(overlap) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] # F[1] gene F[2] # =========================> # ------------- # F[8] F[9] # gene # F[1]=========================>F[2] # F[8]---------------F[9] if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] ## F[1]=============================>F[2] ## F[8]---------------F[9] if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) overlap = (end_TR-start_TE)+1 tmplist.append(overlap) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] # F[1]<======================F[2] # --------------- # F[8] F[9] # # # F[1]<=============================F[2] # F[8]---------------F[9] if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) tmplist.append("TE_overlap_TSS") linelist.append(tmplist) tmplist = [] # F[1]<=============================F[2] # F[8]---------------F[9] if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) #tmplist.append(line.strip()) tmplist.append("TE_in_gene") linelist.append(tmplist) tmplist = [] # F[1] gene F[2] # ============================== # ----------- # F[8] F[9] if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) #lenTE = (end_TE-start_TE)+1 tmplist.append(0) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]======================F[2] # F[8]----------------------------------------------------F[9] if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) #tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]==================================>F[2] # F[8]----------------------------------------------------------F[9] if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) #tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]<==================================F[2] # F[8]----------------------------------------------------------F[9] if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) #tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]==================================>F[2] # F[8]----------------------------------------------------------F[9] if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): for i in range(0,len(line.split("\t"))-1): tmplist.append(line.split("\t")[i]) tmplist.append(0) #tmplist.append(line.strip()) tmplist.append("gene_in_TE") linelist.append(tmplist) tmplist = [] # F[1]<==================================F[2] # F[8]----------------------------------------------------------F[9] return linelist def writeOutputFromList(self, listPossitions , outputFile): w = open(outputFile,'w') for s in listPossitions: line= "\t".join(str(item) for item in s) w.write("%s\n" % line) w.close() def run(self): self.checkoption() listPossitions = self.getClosestToStartSite(self._inputFile) if self._cuffcom_tmap == "": self.writeOutputFromList(listPossitions, self._outputFile ) else: listclasscode = self.getClassCodeCuffcompare(self._cuffcom_tmap, listPossitions) self.writeOutputFromList(listclasscode, self._outputFile) if __name__== "__main__": iClosestToStartSite = ClosestToStartSite() iClosestToStartSite.setAttributesFromCmdLine() iClosestToStartSite.run()