diff _modules/IData_handling.py @ 0:69e8f12c8b31 draft

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 15:06:20 +0000
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/_modules/IData_handling.py	Fri Mar 11 15:06:20 2022 +0000
@@ -0,0 +1,339 @@
+## @file IData_handling.py
+#
+# VL: Gathers the classes and functions useful for handling input data.
+from __future__ import print_function
+
+import gzip
+from _modules.utilities import reverseComplement,changeCase
+from time import gmtime, strftime
+import datetime
+
+try:
+    import cPickle as pickle
+except ImportError:  # python 3.x
+    import pickle
+
+
+## This class encapsulates the reference sequences, the host sequence (if any), and all useful information about the sequences.
+#
+# It is used both for searching the read extracts in the sequences and for exploiting the results.
+class refData:
+    def __init__(self,refseq_list,seed,hostseq):
+        self.refseq_list=refseq_list
+        self.seed=seed
+        self.hostseq=hostseq
+        if hostseq!="":
+            self.refseq_list.insert(0,hostseq)
+        self.nb_sequences=len(refseq_list)
+
+    def getIdxSeq(self,refseq):
+        try:
+            return self.refseq_list.index(refseq)
+        except ValueError:
+            raise RuntimeError("Couldn't find sequence in list of ref sequences.")
+
+
+    def IdxIsHostseq(self,idx_seq):
+        if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or (
+            (self.hostseq != "") and (idx_seq >0))):
+            return False
+        return True
+
+    def getSeqSizesList(self):
+        seq_sizes_list = []
+        for seq in self.refseq_list:
+            seq_sizes_list.append(len(seq))
+        return seq_sizes_list
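+
+# Illustrative usage sketch for refData (the sequences and seed length below are
+# hypothetical, not taken from the pipeline itself):
+#
+#   host = "ACGTACGTACGT"
+#   refs = ["TTTTAAAACCCC", "GGGGCCCCAAAA"]
+#   ref_data = refData(refs, seed=10, hostseq=host)
+#   ref_data.IdxIsHostseq(0)       # True: index 0 is the host when a host is given
+#   ref_data.getIdxSeq(refs[1])    # index of a reference sequence in the list
+#   ref_data.getSeqSizesList()     # lengths of the host + reference sequences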
+
+
+## Base class for handling read extracts.
+#
+# This class should not be used directly.
+class ReadExtracts:
+    def __init__(self,seed):
+        self.seed = seed
+        self.r_extracts_list = []
+        self.nb_reads = 0
+        self.nb_extracts=0
+
+    ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts
+    def getRExtracts(self):
+        return self.r_extracts_list,self.nb_reads,self.nb_extracts
+
+## Class containing all the read extracts (PE reads) that must be mapped against a sequence.
+class readExtractsPE(ReadExtracts):
+    def __init__(self,seed):
+        ReadExtracts.__init__(self,seed)
+
+
+    def addRead(self, whole_PE1,whole_PE2):
+        self.r_extracts_list.append(whole_PE1[:self.seed])
+        self.r_extracts_list.append(whole_PE1[-self.seed:])
+        self.r_extracts_list.append(whole_PE2[:self.seed])
+        self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed])
+        self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:])
+        self.nb_reads += 1
+        self.nb_extracts += 5  # Number of extracts per read: 2 extracts for PE1 and 3 for PE2.
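+
+# Illustrative sketch for paired-end data (the mate sequences below are hypothetical;
+# in the pipeline they come from the PE1/PE2 fastq files):
+#
+#   re_pe = readExtractsPE(seed=6)
+#   re_pe.addRead("ACGTACGTACGT", "TTGGCCAATTGG")
+#   extracts, nb_reads, nb_extracts = re_pe.getRExtracts()   # 5 extracts per read pair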
+
+
+
+## Class containing all the read extracts (single reads) that must be mapped against a sequence.
+class readsExtractsS(ReadExtracts):
+    def __init__(self,seed):
+        ReadExtracts.__init__(self,seed)
+
+    ## Adds a read to the list of extracts
+    #
+    # @param whole_read The read as extracted from the fastq file
+    # @param no_pair This parameter is only used to make the distinction between single-end and paired-end reads.
+    # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance.
+    # TODO: test it when all the rest works.
+    def addRead(self,whole_read,no_pair=""):
+        read_part = whole_read[:self.seed]
+        self.r_extracts_list.append(read_part)
+        self.r_extracts_list.append(whole_read[-self.seed:])
+        self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed])
+        self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:])
+        self.nb_reads+=1
+        self.nb_extracts += 4
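+
+# Illustrative sketch for single-end data (hypothetical read sequence):
+#
+#   re_s = readsExtractsS(seed=6)
+#   re_s.addRead("ACGTACGTACGT")
+#   extracts, nb_reads, nb_extracts = re_s.getRExtracts()    # 4 extracts per read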
+
+## Use objects of this class to store the read offsets (PE1 and PE2) in the FASTQ files.
+class ReadInfo:
+    def __init__(self, off_PE1,whole_read,seed,off_PE2=None):
+        self.offset1=off_PE1
+        self.offset2=off_PE2
+        self.corlen = len(whole_read) - seed
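+
+# Illustrative sketch (the offsets below are made up; in practice they come from file.tell()):
+#
+#   info = ReadInfo(off_PE1=1024, whole_read="ACGTACGTACGT", seed=6, off_PE2=2048)
+#   info.offset1, info.offset2   # byte offsets of the read in the PE1/PE2 files
+#   info.corlen                  # read length minus the seed length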
+
+## Gets the number of reads in the fastq file
+# def getNbReads(fastq):
+#     with open(fastq) as f:
+#         for i, l in enumerate(f):
+#             pass
+#     nb_r=i+1
+#     nb_r=nb_r/4
+#     return nb_r
+
+
+
+## Loads a chunk of reads for mapping on the GPU.
+# Yields a ReadExtracts object plus a dictionary of ReadInfo objects.
+# Keeps the parsing state of the file in memory between chunks.
+# @param ch_siz Chunk size, expressed in number of reads.
+# @param reset_ids Indicates whether or not the read index should be reset to 0 at the beginning of each chunk.
+def getChunk(fastq,seed,paired,ch_siz,reset_ids=True):
+    new_chunk = False
+    d_rinfo=dict()
+    idx_read=0
+    off2=None
+    filin = open(fastq)
+    line = filin.readline()
+    read_paired=""
+    if paired != "":
+        RE=readExtractsPE(seed)
+        filin_paired = open(paired)
+        line_paired = filin_paired.readline()
+    else:
+        RE=readsExtractsS(seed)
+
+    start = False
+    num_line=0
+    while line:
+        # Read sequence
+        read = line.split("\n")[0].split("\r")[0]
+        if paired != "":
+            read_paired = line_paired.split("\n")[0].split("\r")[0]
+        if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
+            start = True
+            off1=filin.tell()
+            line = filin.readline()
+            num_line += 1  # track the FASTQ line index so the '@' test above only matches header lines
+            if paired != "":
+                off2=filin_paired.tell()
+                line_paired = filin_paired.readline()
+            continue
+        if (start == True):
+            start = False
+            readlen = len(read)
+            if readlen < seed:
+                line = filin.readline()
+                num_line += 1
+                if paired !="":
+                    line_paired = filin_paired.readline() # also skip PE2 in that case
+                continue
+            RE.addRead(read,read_paired)
+            d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
+            if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)):
+                yield RE,d_rinfo
+                if (reset_ids):
+                    idx_read=0
+                    new_chunk=True
+                if paired != "":
+                    RE = readExtractsPE(seed)
+                else:
+                    RE = readsExtractsS(seed)
+                d_rinfo = dict()
+            if not new_chunk:
+                idx_read+=1
+            else:
+                new_chunk=False
+
+        line = filin.readline()
+        num_line += 1
+        if paired!="":
+            line_paired = filin_paired.readline()
+    filin.close()
+    if paired !="":
+        filin_paired.close()
+    yield RE, d_rinfo
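+
+# Illustrative sketch of chunked parsing (the file name, seed and chunk size are hypothetical):
+#
+#   for RE, d_rinfo in getChunk("reads.fastq", seed=20, paired="", ch_siz=10000):
+#       extracts, nb_reads, nb_extracts = RE.getRExtracts()
+#       # map `extracts` on the GPU, then use `d_rinfo` to locate the full reads again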
+
+## Dumps a dictionary of ReadInfo objects indexed on read index.
+#
+# @param d_rinfo dictionary to dump
+# @param fic filename (incl. full path) of the dump file
+def dump_d_rinfo(d_rinfo,fic):
+    with open(fic, 'wb') as fp:
+        pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL)
+
+## Loads a dictionary of ReadInfo objects.
+def load_d_rinfo(fic):
+    with open(fic, 'rb') as fp:
+        d_rinfo = pickle.load(fp)
+    return d_rinfo
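+
+# Illustrative round trip (the pickle file name is hypothetical):
+#
+#   dump_d_rinfo(d_rinfo, "d_rinfo_chunk0.pickle")
+#   d_rinfo_back = load_d_rinfo("d_rinfo_chunk0.pickle")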
+
+
+## Loads all read extracts into a list for processing on the GPU.
+#
+# Returns a ReadExtracts object (paired-end or single-end) plus a dictionary of ReadInfo objects.
+def getAllReads(fastq,seed,paired):
+    d_rinfo=dict()
+    idx_read=0
+    off2=None
+    filin = open(fastq)
+    line = filin.readline()
+    read_paired=""
+
+    if paired != "":
+        RE=readExtractsPE(seed)
+        filin_paired = open(paired)
+        line_paired = filin_paired.readline()
+    else:
+        RE=readsExtractsS(seed)
+
+    start = False
+    num_line=0
+    while line:
+        # Read sequence
+        read = line.split("\n")[0].split("\r")[0]
+        if paired != "":
+            read_paired = line_paired.split("\n")[0].split("\r")[0]
+        if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
+            start = True
+            off1=filin.tell()
+            line = filin.readline()
+            num_line += 1  # track the FASTQ line index so the '@' test above only matches header lines
+            if paired != "":
+                off2=filin_paired.tell()
+                line_paired = filin_paired.readline()
+            continue
+        if (start == True):
+            start = False
+            readlen = len(read)
+            if readlen < seed:
+                line = filin.readline()
+                num_line += 1
+                if paired !="":
+                    line_paired = filin_paired.readline() # also skip PE2 in that case
+                continue
+            RE.addRead(read,read_paired)
+            d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
+            idx_read+=1
+
+        line = filin.readline()
+        num_line += 1
+        if paired!="":
+            line_paired = filin_paired.readline()
+    filin.close()
+    if paired !="":
+        filin_paired.close()
+    return RE,d_rinfo
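+
+# Illustrative sketch for a paired-end dataset loaded in one go (file names and seed are hypothetical):
+#
+#   RE, d_rinfo = getAllReads("reads_R1.fastq", seed=20, paired="reads_R2.fastq")
+#   extracts, nb_reads, nb_extracts = RE.getRExtracts()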
+
+## Use this class to retrieve reads from the FASTQ file.
+class ReadGetter:
+    ## constructor
+    #
+    # @param fastq Name of the fastq file that contains the read
+    # @param d_rinfo A dictionary of ReadInfo objects containing the offsets indicating where the reads start in the file.
+    # @param paired The name of the file containing the PE2 (defaults to "").
+    def __init__(self,fastq,d_rinfo,paired=""):
+        self.filin=open(fastq)
+        self.d_rinfo=d_rinfo
+        self.paired=paired
+        if paired!="":
+            self.filinp=open(paired)
+
+    def getOneRead(self,idx_read):
+        read_paired=""
+        self.filin.seek(self.d_rinfo[idx_read].offset1)
+        read=self.filin.readline()
+        if self.paired!="":
+            self.filinp.seek(self.d_rinfo[idx_read].offset2)
+            read_paired = self.filinp.readline()
+        return read,read_paired
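+
+# Illustrative sketch (assumes d_rinfo was produced by getAllReads/getChunk on the same files;
+# file names are hypothetical):
+#
+#   getter = ReadGetter("reads_R1.fastq", d_rinfo, paired="reads_R2.fastq")
+#   read, read_paired = getter.getOneRead(0)   # raw sequence lines of read 0 and its mate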
+
+
+### READS Functions
+def totReads(filin):
+    """Verify and retrieve the number of reads in the fastq file before alignment"""
+    if filin.endswith('.gz'):
+        filein = gzip.open(filin, 'rb')
+    else:
+        filein = open(filin, 'r')
+
+    line = 0
+    while filein.readline():
+        line += 1
+    seq = float(round(line / 4))
+    filein.close()
+    return seq
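+
+# Illustrative call (hypothetical file name; gzipped input is handled transparently):
+#
+#   nb_reads = totReads("reads.fastq.gz")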
+
+### SEQUENCE parsing function
+def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0):
+    """Get genome sequence from fasta file"""
+    print("recovering genome from: ",filin)
+    print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
+    if filin == "":
+        return "", "", ""
+
+    infile = open(filin, 'r')
+    name = []
+    genome = []
+    genome_line = ""
+    genome_rejected = 0
+    for line in infile:
+        if line[0] == ">":
+            if name != []:
+                if len(genome_line) >= limit_reference:
+                    genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
+                else:
+                    genome_rejected += 1
+                    name = name[:-1]
+                genome_line = ""
+            name.append(line[1:].split('\r')[0].split('\n')[0])
+        else:
+            genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0]
+
+    if len(genome_line) >= limit_reference:
+        genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
+        genome_line = ""
+    else:
+        genome_rejected += 1
+        name = name[:-1]
+
+    infile.close()
+
+    if host_test:
+        return "".join(genome)
+    else:
+        return genome, name, genome_rejected
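+
+# Illustrative calls (file name and thresholds are hypothetical):
+#
+#   genome, name, genome_rejected = genomeFastaRecovery("refs.fasta", limit_reference=1000, edge=50)
+#   host_genome = genomeFastaRecovery("host.fasta", limit_reference=1000, edge=50, host_test=1)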
+