diff _modules/IData_handling.py @ 0:69e8f12c8b31 draft
"planemo upload"
author   | bioit_sciensano
date     | Fri, 11 Mar 2022 15:06:20 +0000
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_modules/IData_handling.py	Fri Mar 11 15:06:20 2022 +0000
@@ -0,0 +1,339 @@
+## @file IData_handling.py
+#
+# VL: Gather here the classes and functions useful for handling input data.
+from __future__ import print_function
+
+import gzip
+from _modules.utilities import reverseComplement,changeCase
+from time import gmtime, strftime
+import datetime
+
+try:
+    import cPickle as pickle
+except ImportError:  # python 3.x
+    import pickle
+
+
+## This class encapsulates the reference sequences, the host sequence if any, and all useful information about the sequences.
+#
+# It is used both for searching the read extracts in the sequences and for exploiting the results.
+class refData:
+    def __init__(self,refseq_list,seed,hostseq):
+        self.refseq_list=refseq_list
+        self.seed=seed
+        self.hostseq=hostseq
+        if hostseq!="":
+            self.refseq_list.insert(0,hostseq)
+        self.nb_sequences=len(refseq_list)
+
+    def getIdxSeq(self,refseq):
+        idx=-1
+        found=False
+        for s in self.refseq_list:
+            idx += 1
+            if s==refseq:
+                found=True
+                break
+        if not found:
+            raise RuntimeError("Couldn't find sequence in list of ref sequences.")
+        return idx
+
+    def IdxIsHostseq(self,idx_seq):
+        if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or
+                ((self.hostseq != "") and (idx_seq > 0))):
+            return False
+        return True
+
+    def getSeqSizesList(self):
+        seq_sizes_list = []
+        for seq in self.refseq_list:
+            seq_sizes_list.append(len(seq))
+        return seq_sizes_list
+
+
+## Base class for handling read extracts.
+#
+# This class should not be used directly.
+class ReadExtracts:
+    def __init__(self,seed):
+        self.seed = seed
+        self.r_extracts_list = []
+        self.nb_reads = 0
+        self.nb_extracts = 0
+
+    ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts.
+    def getRExtracts(self):
+        return self.r_extracts_list,self.nb_reads,self.nb_extracts
+
+
+## Class containing all the read extracts (PE reads) that must be mapped against a sequence.
+class readExtractsPE(ReadExtracts):
+    def __init__(self,seed):
+        ReadExtracts.__init__(self,seed)
+
+    def addRead(self, whole_PE1,whole_PE2):
+        self.r_extracts_list.append(whole_PE1[:self.seed])
+        self.r_extracts_list.append(whole_PE1[-self.seed:])
+        self.r_extracts_list.append(whole_PE2[:self.seed])
+        self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed])
+        self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:])
+        self.nb_reads += 1
+        self.nb_extracts += 5  # Number of extracts per read: 2 extracts for PE1 and 3 for PE2.
+
+
+## Class containing all the read extracts (single reads) that must be mapped against a sequence.
+class readsExtractsS(ReadExtracts):
+    def __init__(self,seed):
+        ReadExtracts.__init__(self,seed)
+
+    ## Adds a read to the list of extracts.
+    #
+    # @param whole_read The read as extracted from the fastq file
+    # @param no_pair This parameter is only used to make the distinction between single and paired reads.
+    # Note VL: I didn't use metaprogramming here because I thought that it would have a negative impact on performance.
+    # TODO: test it when all the rest works.
+    def addRead(self,whole_read,no_pair=""):
+        read_part = whole_read[:self.seed]
+        self.r_extracts_list.append(read_part)
+        self.r_extracts_list.append(whole_read[-self.seed:])
+        self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed])
+        self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:])
+        self.nb_reads += 1
+        self.nb_extracts += 4
+
+
+## Use objects of this class to store read offsets (PE1 and PE2) in the fastq files.
+class ReadInfo:
+    def __init__(self, off_PE1,whole_read,seed,off_PE2=None):
+        self.offset1 = off_PE1
+        self.offset2 = off_PE2
+        self.corlen = len(whole_read) - seed
+
+
+## Gets the number of reads in the fastq file
+# def getNbReads(fastq):
+#     with open(fastq) as f:
+#         for i, l in enumerate(f):
+#             pass
+#     nb_r=i+1
+#     nb_r=nb_r/4
+#     return nb_r
+
+
+## Loads a chunk of reads for mapping on GPU.
+# Yields a ReadExtracts object plus a dictionary of ReadInfo.
+# Keeps the parsing state of the file in memory.
+# @param ch_siz Chunk size, in number of reads.
+# @param reset_ids Indicates whether or not the read index should be reset to 0 at the beginning of each chunk.
+def getChunk(fastq,seed,paired,ch_siz,reset_ids=True):
+    new_chunk = False
+    d_rinfo = dict()
+    idx_read = 0
+    off2 = None
+    filin = open(fastq)
+    line = filin.readline()
+    read_paired = ""
+    if paired != "":
+        RE = readExtractsPE(seed)
+        filin_paired = open(paired)
+        line_paired = filin_paired.readline()
+    else:
+        RE = readsExtractsS(seed)
+
+    start = False
+    num_line = 0  # index of the fastq line currently held in `line`; header lines sit at num_line % 4 == 0
+    while line:
+        # Read sequence
+        read = line.split("\n")[0].split("\r")[0]
+        if paired != "":
+            read_paired = line_paired.split("\n")[0].split("\r")[0]
+        if (read[0] == '@' and num_line % 4 == 0):  # make sure we don't take into account a quality score instead of a read.
+            start = True
+            off1 = filin.tell()
+            line = filin.readline()
+            num_line += 1
+            if paired != "":
+                off2 = filin_paired.tell()
+                line_paired = filin_paired.readline()
+            continue
+        if (start == True):
+            start = False
+            readlen = len(read)
+            if readlen < seed:
+                line = filin.readline()
+                num_line += 1
+                if paired != "":
+                    line_paired = filin_paired.readline()  # also skip PE2 in that case
+                continue
+            RE.addRead(read,read_paired)
+            d_rinfo[idx_read] = ReadInfo(off1,read,seed,off2)
+            if (idx_read > 0 and ((idx_read+1) % (ch_siz) == 0)):
+                yield RE,d_rinfo
+                if (reset_ids):
+                    idx_read = 0
+                    new_chunk = True
+                if paired != "":
+                    RE = readExtractsPE(seed)
+                else:
+                    RE = readsExtractsS(seed)
+                d_rinfo = dict()
+            if not new_chunk:
+                idx_read += 1
+            else:
+                new_chunk = False
+
+        line = filin.readline()
+        num_line += 1
+        if paired != "":
+            line_paired = filin_paired.readline()
+    filin.close()
+    if paired != "":
+        filin_paired.close()
+    yield RE, d_rinfo
+
+
+## Dumps a dictionary of ReadInfo objects indexed on read index.
+#
+# @param d_rinfo dictionary to dump
+# @param fic filename (incl. full path) where to dump
+def dump_d_rinfo(d_rinfo,fic):
+    with open(fic, 'wb') as fp:
+        pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL)
+
+
+## Loads a dictionary of ReadInfo objects.
+def load_d_rinfo(fic):
+    with open(fic, 'rb') as fp:
+        d_rinfo = pickle.load(fp)
+    return d_rinfo
+
+
+## Loads all read extracts into a list for processing on GPU.
+#
+# Returns a ReadExtracts object (single or paired) plus a dictionary of ReadInfo.
+def getAllReads(fastq,seed,paired):
+    d_rinfo = dict()
+    idx_read = 0
+    off2 = None
+    filin = open(fastq)
+    line = filin.readline()
+    read_paired = ""
+
+    if paired != "":
+        RE = readExtractsPE(seed)
+        filin_paired = open(paired)
+        line_paired = filin_paired.readline()
+    else:
+        RE = readsExtractsS(seed)
+
+    start = False
+    num_line = 0  # index of the fastq line currently held in `line`; header lines sit at num_line % 4 == 0
+    while line:
+        # Read sequence
+        read = line.split("\n")[0].split("\r")[0]
+        if paired != "":
+            read_paired = line_paired.split("\n")[0].split("\r")[0]
+        if (read[0] == '@' and num_line % 4 == 0):  # make sure we don't take into account a quality score instead of a read.
+            start = True
+            off1 = filin.tell()
+            line = filin.readline()
+            num_line += 1
+            if paired != "":
+                off2 = filin_paired.tell()
+                line_paired = filin_paired.readline()
+            continue
+        if (start == True):
+            start = False
+            readlen = len(read)
+            if readlen < seed:
+                line = filin.readline()
+                num_line += 1
+                if paired != "":
+                    line_paired = filin_paired.readline()  # also skip PE2 in that case
+                continue
+            RE.addRead(read,read_paired)
+            d_rinfo[idx_read] = ReadInfo(off1,read,seed,off2)
+            idx_read += 1
+
+        line = filin.readline()
+        num_line += 1
+        if paired != "":
+            line_paired = filin_paired.readline()
+    filin.close()
+    if paired != "":
+        filin_paired.close()
+    return RE,d_rinfo
+
+
+## Use this class to retrieve reads from the fastq file.
+class ReadGetter:
+    ## constructor
+    #
+    # @param fastq Name of the fastq file that contains the reads (PE1).
+    # @param d_rinfo A dictionary of ReadInfo objects that contains the offsets indicating where each read starts in the file.
+    # @param paired The name of the file containing the PE2 reads (defaults to "").
+    def __init__(self,fastq,d_rinfo,paired=""):
+        self.filin = open(fastq)
+        self.d_rinfo = d_rinfo
+        self.paired = paired
+        if paired != "":
+            self.filinp = open(paired)
+
+    def getOneRead(self,idx_read):
+        read_paired = ""
+        self.filin.seek(self.d_rinfo[idx_read].offset1)
+        read = self.filin.readline()
+        if self.paired != "":
+            self.filinp.seek(self.d_rinfo[idx_read].offset2)
+            read_paired = self.filinp.readline()
+        return read,read_paired
+
+
+### READS Functions
+def totReads(filin):
+    """Verify and retrieve the number of reads in the fastq file before alignment"""
+    if filin.endswith('.gz'):
+        filein = gzip.open(filin, 'rb')
+    else:
+        filein = open(filin, 'r')
+
+    line = 0
+    while filein.readline():
+        line += 1
+    seq = float(round(line / 4))
+    filein.close()
+    return seq
+
+
+### SEQUENCE parsing function
+def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0):
+    """Get genome sequence from fasta file"""
+    print("recovering genome from: ",filin)
+    print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
+    if filin == "":
+        return "", "", ""
+
+    infile = open(filin, 'r')
+    name = []
+    genome = []
+    genome_line = ""
+    genome_rejected = 0
+    for line in infile:
+        if line[0] == ">":
+            if name != []:
+                if len(genome_line) >= limit_reference:
+                    genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
+                else:
+                    genome_rejected += 1
+                    name = name[:-1]
+                genome_line = ""
+            name.append(line[1:].split('\r')[0].split('\n')[0])
+        else:
+            genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0]
+
+    if len(genome_line) >= limit_reference:
+        genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
+        genome_line = ""
+    else:
+        genome_rejected += 1
+        name = name[:-1]
+
+    infile.close()
+
+    if host_test:
+        return "".join(genome)
+    else:
+        return genome, name, genome_rejected
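
For reference, below is a minimal usage sketch (not part of the diff) showing how the main entry points of this module fit together. The file names, the seed length of 20, and the limit_reference/edge values are illustrative assumptions; only the function and class names come from the code above.

    from _modules.IData_handling import (refData, getChunk, ReadGetter,
                                         genomeFastaRecovery, totReads)

    seed = 20  # assumed length of the read extracts

    # Load the phage reference(s) and, optionally, a host sequence.
    genome, names, nb_rejected = genomeFastaRecovery("phage.fasta", limit_reference=500, edge=500)
    host = genomeFastaRecovery("host.fasta", 500, 500, host_test=1)  # host is returned as a single string
    refs = refData(genome, seed, host)

    print(totReads("reads_R1.fastq"), "reads in the PE1 file")

    # Stream the reads in chunks of 100000 and collect their extracts.
    for RE, d_rinfo in getChunk("reads_R1.fastq", seed, paired="reads_R2.fastq", ch_siz=100000):
        extracts, nb_reads, nb_extracts = RE.getRExtracts()
        # ... map `extracts` against refs.refseq_list here ...

    # Retrieve the raw reads of the last chunk again using the stored file offsets.
    getter = ReadGetter("reads_R1.fastq", d_rinfo, paired="reads_R2.fastq")
    pe1, pe2 = getter.getOneRead(0)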