# BSseeker2/bs_align/bs_single_end.py @ 0:e6df770c0e58 draft
# Initial upload
# author: weilong-guo
# date: Fri, 12 Jul 2013 18:47:28 -0400

import fileinput, os, time, random, math
from bs_utils.utils import *
from bs_align_utils import *
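
# Single-end bisulfite alignment stage of BS-Seeker2: splits the read file,
# trims adapters, writes C->T / G->A converted copies of each read, maps them
# in parallel against the pre-converted genome indexes, and calls methylation
# for every uniquely mapped read.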

#----------------------------------------------------------------
# Read the aligner output and return dictionaries of unique-hit and
# multiple-hit reads. The function assumes at most 2 hits per read
# are reported in a single file.
def extract_mapping(ali_file):
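    """Collect hits per read from one aligner-output file.

    Returns (unique_hits, non_unique_hits): unique_hits maps a read header to
    its single best (no_mismatch, chr, location, cigar) tuple; non_unique_hits
    maps a header to the smallest mismatch count when two hits tie for best.
    """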
    unique_hits = {}
    non_unique_hits = {}

    header0 = ""
    lst = []

    for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file):
        #------------------------------
        if header != header0:
            #---------- output -----------
            if len(lst) == 1:
                unique_hits[header0] = lst[0]      # (no_mismatch, chr, location, cigar)
            elif len(lst) > 1:
                min_lst = min(lst, key = lambda x: x[0])
                max_lst = max(lst, key = lambda x: x[0])

                if min_lst[0] < max_lst[0]:
                    unique_hits[header0] = min_lst
                else:
                    non_unique_hits[header0] = min_lst[0]
            header0 = header
            lst = [(no_mismatch, chr, location, cigar)]
        else: # header == header0, same header (read id)
            lst.append((no_mismatch, chr, location, cigar))

    if len(lst) == 1:
        unique_hits[header0] = lst[0]      # (no_mismatch, chr, location, cigar)
    elif len(lst) > 1:
        min_lst = min(lst, key = lambda x: x[0])
        max_lst = max(lst, key = lambda x: x[0])

        if min_lst[0] < max_lst[0]:
            unique_hits[header0] = min_lst
        else:
            non_unique_hits[header0] = min_lst[0]


    return unique_hits, non_unique_hits
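
# Illustrative example (hypothetical aligner output): if read "r1" has hits
# with 1 and 3 mismatches, unique_hits["r1"] becomes the 1-mismatch tuple;
# if both hits had 1 mismatch, non_unique_hits["r1"] would be set to 1 instead.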


def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines,
                  max_mismatch_no, aligner_command, db_path, tmp_path, outfile,
                  XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False):
    #----------------------------------------------------------------
    # adapter : strand-specific or not
    adapter=""
    adapter_fw=""
    adapter_rc=""
    if adapter_file !="":
        try :
            adapter_inf=open(adapter_file,"r")
        except IOError:
            print "[Error] Cannot open adapter file : %s" % adapter_file
            exit(-1)
        if asktag == "N": #<--- directional library
            adapter = adapter_inf.readline().rstrip("\n")
        elif asktag == "Y": #<--- un-directional library
            adapter_fw = adapter_inf.readline().rstrip("\n")
            adapter_rc = adapter_inf.readline().rstrip("\n")
        adapter_inf.close()
    #----------------------------------------------------------------



    #----------------------------------------------------------------
    logm("Read filename: %s"% main_read_file )
    logm("Un-directional library: %s" % asktag )
    logm("The first base (for mapping): %d" % cut1)
    logm("The last base (for mapping): %d" % cut2)
    logm("Max. lines per mapping: %d"% no_small_lines)
    logm("Aligner: %s" % aligner_command)
    logm("Reference genome library path: %s" % db_path )
    logm("Number of mismatches allowed: %s" % max_mismatch_no )
    if adapter_file !="":
        if asktag=="N":
            logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n")))
        elif asktag=="Y":
            logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) )
            logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) )
    #----------------------------------------------------------------

    # helpers to prefix a file name with the temporary (tmp_d) / reference-genome (db_d) directory
    tmp_d = lambda fname: os.path.join(tmp_path, fname)

    db_d = lambda fname:  os.path.join(db_path, fname)
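    # e.g. tmp_d("W_C2T.mapping") -> os.path.join(tmp_path, "W_C2T.mapping")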

    #----------------------------------------------------------------
    # splitting the big read file

    input_fname = os.path.split(main_read_file)[1]


    #---- Stats ------------------------------------------------------------
    all_raw_reads=0
    all_trimmed=0
    all_mapped=0
    all_mapped_passed=0
    all_multiple_hits=0

    numbers_premapped_lst=[0,0,0,0]
    numbers_mapped_lst=[0,0,0,0]

    mC_lst=[0,0,0]
    uC_lst=[0,0,0]

    no_my_files=0

    #----------------------------------------------------------------
    logm("== Start mapping ==")

    for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines):
        original_bs_reads = {}
        no_my_files+=1
        random_id = ".tmp-"+str(random.randint(1000000,9999999))

        #-------------------------------------------------------------------
        # undirectional sequencing
        #-------------------------------------------------------------------
        if asktag=="Y":  

            #----------------------------------------------------------------
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outfile3=tmp_d('Trimmed_G2A.fa'+random_id)

            outf2=open(outfile2,'w')
            outf3=open(outfile3,'w')

            #----------------------------------------------------------------
            # detect format of input file
            try :
                read_inf=open(read_file,"r")
            except IOError :
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline=read_inf.readline()
            l=oneline.split()
            input_format=""
            if oneline[0]=="@":	# fastq
                input_format="fastq"
                n_fastq=0
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # qseq
                input_format="qseq"
            elif oneline[0]==">":	# fasta
                input_format="fasta"
                n_fasta=0
            read_inf.close()
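
            # Format heuristic based on the first line only: "@..." = FASTQ,
            # ">..." = FASTA, 11 whitespace-separated fields = Illumina qseq,
            # and a single bare token = one raw sequence per line.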

            #----------------------------------------------------------------
            # read sequence, remove adapter and convert 
            read_id=""
            seq=""
            seq_ready="N"
            for line in fileinput.input(read_file):
                l=line.split()

                if input_format=="seq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[0]
                    seq_ready="Y"
                elif input_format=="fastq":
                    m_fastq=math.fmod(n_fastq,4)
                    n_fastq+=1
                    seq_ready="N"
                    if m_fastq==0:
                        all_raw_reads+=1
                        read_id=str(all_raw_reads)
                        read_id=read_id.zfill(12)
                        seq=""
                    elif m_fastq==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                elif input_format=="qseq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[8]
                    seq_ready="Y"
                elif input_format=="fasta":
                    m_fasta=math.fmod(n_fasta,2)
                    n_fasta+=1
                    seq_ready="N"
                    if m_fasta==0:
                        all_raw_reads+=1
                        #read_id=str(all_raw_reads)
                        read_id=l[0][1:]
                        seq=""
                    elif m_fasta==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""

                #----------------------------------------------------------------
                if seq_ready=="Y":
                    seq=seq[cut1-1:cut2] # keep bases cut1..cut2 (1-based, inclusive); e.g. cut2=52 (-e 52) keeps the first 52 bp of a 72 bp read
                    seq=seq.upper()
                    seq=seq.replace(".","N")

                    # stripping BS adapter from 3' end of read
                    if (adapter_fw !="") and (adapter_rc !="") :
                        new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch)
                        new_read = Remove_5end_Adapter(new_read, adapter_rc)
                        if len(new_read) < len(seq) :
                            all_trimmed += 1
                        seq = new_read

                    if len(seq)<=4:
                        seq = "N" * (cut2 - cut1 + 1)

                    #---------  trimmed_raw_BS_read  ------------------
                    original_bs_reads[read_id] = seq
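
                    # Three-letter alignment: every read is written out twice,
                    # once C->T and once G->A, so that bisulfite-converted reads
                    # can be matched against the four pre-converted genome
                    # indexes (W_C2T / C_C2T for one conversion, W_G2A / C_G2A
                    # for the other) regardless of which strand they came from.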

                    #---------  FW_C2T  ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))
                    #---------  RC_G2A  ------------------
                    outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A")))

            fileinput.close()

            outf2.close()
            outf3.close()

            delete_files(read_file)

            #--------------------------------------------------------------------------------
            # Bowtie mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)
            WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id)
            CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id)
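
            # aligner_command is a %-formatting template filled in with the
            # keys below; a hypothetical bowtie invocation (the exact flags
            # are set by the caller, not here) could look like:
            #   "bowtie -m 2 --norc %(reference_genome)s -f %(input_file)s %(output_file)s"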


            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                   'input_file' : outfile2,
                                                   'output_file' : WC2T},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                   'input_file' : outfile2,
                                                   'output_file' : CC2T},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'),
                                                   'input_file' : outfile3,
                                                   'output_file' : WG2A},

                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'),
                                                   'input_file' : outfile3,
                                                   'output_file' : CG2A} ])


            delete_files(outfile2, outfile3)


            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------

            FW_C2T_U,FW_C2T_R=extract_mapping(WC2T)
            RC_G2A_U,RC_G2A_R=extract_mapping(CG2A)

            FW_G2A_U,FW_G2A_R=extract_mapping(WG2A)
            RC_C2T_U,RC_C2T_R=extract_mapping(CC2T)

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T=set() # +
            Unique_RC_G2A=set() # +
            Unique_FW_G2A=set() # -
            Unique_RC_C2T=set() # -
            Multiple_hits=set()
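
            # Arbitration: for each read, gather mismatch counts from the four
            # unique-hit dicts (slots 0-3) and the four multiple-hit dicts
            # (slots 4-7), using 99 when absent. A read is kept only if the
            # minimum is unique AND falls in slots 0-3; any tie, or a best hit
            # in slots 4-7, marks the read as a multiple hit.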


            for x in Union_set:
                _list=[]
                for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]:
                    mis_lst=d.get(x,[99])
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]:
                    mis=d.get(x,99) 
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini) == 1:
                    mini_index=_list.index(mini)
                    if mini_index == 0:
                        Unique_FW_C2T.add(x)
                    elif mini_index == 1:
                        Unique_RC_G2A.add(x)
                    elif mini_index == 2:
                        Unique_FW_G2A.add(x)
                    elif mini_index == 3:
                        Unique_RC_C2T.add(x)
                    # mini_index 4..7 means the best hit is in a multiple-hit dict
                    else :
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            all_multiple_hits += len(Multiple_hits)
            # write reads rejected as multiple hits to file; append, so that
            # hits from every split file are kept
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'a')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()

            del Union_set
            del FW_C2T_R
            del FW_G2A_R
            del RC_C2T_R
            del RC_G2A_R
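
            # Build [chromosome, header] pairs and sort them so that reads are
            # processed chromosome by chromosome, giving sequential access to
            # the cached genome sequences in gseq below.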

            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_G2A_uniq_lst.sort()
            RC_G2A_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]
            FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst]
            RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst]

            #----------------------------------------------------------------
            numbers_premapped_lst[0] += len(Unique_FW_C2T)
            numbers_premapped_lst[1] += len(Unique_RC_G2A)
            numbers_premapped_lst[2] += len(Unique_FW_C2T and Unique_RC_G2A and Unique_FW_G2A)
            numbers_premapped_lst[3] += len(Unique_RC_C2T)

            # count first, then release the sets
            del Unique_FW_C2T
            del Unique_FW_G2A
            del Unique_RC_C2T
            del Unique_RC_G2A


            #----------------------------------------------------------------

            nn=0
            gseq = dict()
            chr_length = dict()
            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),
                                            (RC_G2A_uniq_lst,RC_G2A_U),
                                            (FW_G2A_uniq_lst,FW_G2A_U),
                                            (RC_C2T_uniq_lst,RC_C2T_U)]:
                nn += 1

                for header in ali_unique_lst:

                    _, mapped_chr, mapped_location, cigar = ali_dic[header]

                    original_BS = original_bs_reads[header]
                    #-------------------------------------
                    if mapped_chr not in gseq:
                        gseq[mapped_chr] =  deserialize(db_d(mapped_chr))
                        chr_length[mapped_chr] = len(gseq[mapped_chr])

                    if nn == 2 or nn == 3:
                        cigar = list(reversed(cigar))
                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)


                    all_mapped += 1
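                    # nn distinguishes the four read/strand combinations; hits
                    # on the Crick-converted indexes (nn 2 and 4) carry
                    # coordinates on the reverse complement, so their location
                    # is flipped with chr_length - location - g_len.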

                    if nn == 1: # +FW mapped to + strand:
                        FR = "+FW"
                        mapped_strand="+"

                    elif nn == 2:  # +RC mapped to + strand:
                        FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+)
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len
                        mapped_strand = "+"
                        original_BS = reverse_compl_seq(original_BS)  # for RC reads

                    elif nn == 3:  # -RC mapped to - strand:
                        mapped_strand = "-"
                        FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-)
                        original_BS = reverse_compl_seq(original_BS)  # for RC reads

                    elif nn == 4:  # -FW mapped to - strand:
                        mapped_strand = "-"
                        FR = "-FW"
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len

                    origin_genome, next_base, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)

                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)


                    if len(r_aln)==len(g_aln):
                        N_mismatch = N_MIS(r_aln, g_aln)
                        if N_mismatch <= int(max_mismatch_no):
                            numbers_mapped_lst[nn-1] += 1
                            all_mapped_passed += 1
                            methy = methy_seq(r_aln, g_aln + next_base)
                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                            #---XS FILTER----------------
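                            # Flag likely-unconverted reads: assuming the
                            # methy_seq convention from bs_align_utils
                            # (y/Y = CHG, z/Z = CHH; lowercase = unmethylated),
                            # a read with more than XS_count methylated CH
                            # sites and a methylated-CH fraction above XS_pct
                            # is tagged XS=1.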
                            XS = 0
                            nCH = methy.count('y') + methy.count('z')
                            nmCH = methy.count('Y') + methy.count('Z')
                            if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                                XS = 1

                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)

            #----------------------------------------------------------------
            logm("--> %s (%d) "%(read_file, no_my_files))
            delete_files(WC2T, WG2A, CC2T, CG2A)



        #--------------------------------------------------------------------
        # directional sequencing
        #--------------------------------------------------------------------

        if asktag=="N":  
            #----------------------------------------------------------------
            outfile2=tmp_d('Trimmed_C2T.fa'+random_id)
            outf2=open(outfile2,'w')

            #----------------------------------------------------------------
            try :
                read_inf=open(read_file,"r")
            except IOError :
                print "[Error] Cannot open input file : %s" % read_file
                exit(-1)

            oneline=read_inf.readline()
            l=oneline.split()
            input_format=""
            if oneline[0]=="@":	# FastQ
                input_format="fastq"
                n_fastq=0
            elif len(l)==1 and oneline[0]!=">": # pure sequences
                input_format="seq"
            elif len(l)==11: # Illumina GAII qseq file
                input_format="qseq"
            elif oneline[0]==">":	# fasta
                input_format="fasta"
                n_fasta=0
            read_inf.close()
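            # (same first-line format heuristic as in the un-directional branch)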
            #----------------------------------------------------------------
            read_id=""
            seq=""
            seq_ready="N"
            for line in fileinput.input(read_file):
                l=line.split()
                if input_format=="seq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[0]
                    seq_ready="Y"
                elif input_format=="fastq":
                    m_fastq=math.fmod(n_fastq,4)
                    n_fastq+=1
                    seq_ready="N"
                    if m_fastq==0:
                        all_raw_reads+=1
                        read_id=str(all_raw_reads)
                        read_id=read_id.zfill(12)
                        seq=""
                    elif m_fastq==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""
                elif input_format=="qseq":
                    all_raw_reads+=1
                    read_id=str(all_raw_reads)
                    read_id=read_id.zfill(12)
                    seq=l[8]
                    seq_ready="Y"
                elif input_format=="fasta":
                    m_fasta=math.fmod(n_fasta,2)
                    n_fasta+=1
                    seq_ready="N"
                    if m_fasta==0:
                        all_raw_reads+=1
                        read_id=l[0][1:]
                        seq=""
                    elif m_fasta==1:
                        seq=l[0]
                        seq_ready="Y"
                    else:
                        seq=""

                #--------------------------------
                if seq_ready=="Y":
                    seq=seq[cut1-1:cut2] # keep bases cut1..cut2 (1-based, inclusive); e.g. cut2=52 (-e 52) keeps the first 52 bp of a 72 bp read
                    seq=seq.upper()
                    seq=seq.replace(".","N")

                    #---- stripping adapter from 3' end of read ----
                    if adapter != "":
                        new_read = RemoveAdapter(seq, adapter, adapter_mismatch)
                        if len(new_read) < len(seq) :
                            all_trimmed += 1
                        seq = new_read

                    if len(seq)<=4:
                        seq = "N" * (cut2-cut1+1)

                    #---------  trimmed_raw_BS_read  ------------------
                    original_bs_reads[read_id] = seq


                    #---------  FW_C2T  ------------------
                    outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T")))

            fileinput.close()

            outf2.close()
            delete_files(read_file)

            #--------------------------------------------------------------------------------
            # Bowtie mapping
            #--------------------------------------------------------------------------------
            WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id)
            CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id)

            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
                                                  'input_file' : outfile2,
                                                  'output_file' : WC2T},
                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
                                                  'input_file' : outfile2,
                                                  'output_file' : CC2T} ])

            delete_files(outfile2)

            #--------------------------------------------------------------------------------
            # Post processing
            #--------------------------------------------------------------------------------


            FW_C2T_U, FW_C2T_R = extract_mapping(WC2T)
            RC_C2T_U, RC_C2T_R = extract_mapping(CC2T)

            #----------------------------------------------------------------
            # get unique-hit reads
            #----------------------------------------------------------------
            Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys())

            Unique_FW_C2T = set() # +
            Unique_RC_C2T = set() # -
            Multiple_hits=set()

            for x in Union_set:
                _list=[]
                for d in [FW_C2T_U,RC_C2T_U]:
                    mis_lst=d.get(x,[99])
                    mis=int(mis_lst[0])
                    _list.append(mis)
                for d in [FW_C2T_R,RC_C2T_R]:
                    mis=d.get(x,99)
                    _list.append(mis)
                mini=min(_list)
                if _list.count(mini)==1:
                    mini_index=_list.index(mini)
                    if mini_index==0:
                        Unique_FW_C2T.add(x)
                    elif mini_index==1:
                        Unique_RC_C2T.add(x)
                    else:
                        Multiple_hits.add(x)
                else :
                    Multiple_hits.add(x)
            all_multiple_hits += len(Multiple_hits)
            # write reads rejected as multiple hits to file; append, so that
            # hits from every split file are kept
            if show_multiple_hit :
                outf_MH=open("Multiple_hit.fa",'a')
                for i in Multiple_hits :
                    outf_MH.write(">%s\n" % i)
                    outf_MH.write("%s\n" % original_bs_reads[i])
                outf_MH.close()



            FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T]
            RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T]
            FW_C2T_uniq_lst.sort()
            RC_C2T_uniq_lst.sort()
            FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst]
            RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst]


            #----------------------------------------------------------------

            numbers_premapped_lst[0] += len(Unique_FW_C2T)
            numbers_premapped_lst[1] += len(Unique_RC_C2T)

            #----------------------------------------------------------------

            nn = 0
            gseq = dict()
            chr_length = dict()
            for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]:
                nn += 1
                for header in ali_unique_lst:
                    _, mapped_chr, mapped_location, cigar = ali_dic[header]
                    original_BS = original_bs_reads[header]
                    #-------------------------------------
                    if mapped_chr not in gseq :
                        gseq[mapped_chr] = deserialize(db_d(mapped_chr))
                        chr_length[mapped_chr] = len(gseq[mapped_chr])
                    #-------------------------------------

                    r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar)

                    all_mapped+=1
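                    # as in the un-directional branch, hits on the Crick index
                    # (nn == 2) carry reverse-complement coordinates and are
                    # flipped below.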
                    if nn == 1: 	# +FW mapped to + strand:
                        FR = "+FW"
                        mapped_strand = "+"
                    elif nn == 2: 	# -FW mapped to - strand:
                        mapped_strand = "-"
                        FR = "-FW"
                        mapped_location = chr_length[mapped_chr] - mapped_location - g_len


                    origin_genome, next_base, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand)
                    r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome)

                    if len(r_aln) == len(g_aln):
                        N_mismatch = N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides
                        if N_mismatch <= int(max_mismatch_no):
                            numbers_mapped_lst[nn-1] += 1
                            all_mapped_passed += 1
                            methy = methy_seq(r_aln, g_aln+next_base)
                            mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst)

                            #---XS FILTER----------------
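                            # (same unconverted-read filter as in the un-directional branch)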
                            XS = 0
                            nCH = methy.count('y') + methy.count('z')
                            nmCH = methy.count('Y') + methy.count('Z')
                            if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) :
                                XS = 1

                            outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome)

            #----------------------------------------------------------------
            logm("--> %s (%d) "%(read_file,no_my_files))
            delete_files(WC2T, CC2T)


    #----------------------------------------------------------------

    delete_files(tmp_path)

    logm("Number of raw reads: %d \n"% all_raw_reads)
    if all_raw_reads >0:
        logm("Number of reads having adapter removed: %d \n" % all_trimed )
        logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
        logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped)
        if asktag=="Y":
            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
            logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
            logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
        elif asktag=="N":
            logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
            logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )

        logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
        if asktag=="Y":
            logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
            logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) )
            logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) )
        elif asktag=="N":
            logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) )
            logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) )
        logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )

        n_CG=mC_lst[0]+uC_lst[0]
        n_CHG=mC_lst[1]+uC_lst[1]
        n_CHH=mC_lst[2]+uC_lst[2]

        logm("----------------------------------------------" )
        logm("Methylated C in mapped reads ")

        logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0))
        logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0))
        logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0))

    logm("------------------- END --------------------" )
    elapsed("=== END %s ===" % main_read_file)