Mercurial > repos > weilong-guo > bs_seeker2

diff BSseeker2/bs_align/bs_pair_end.py @ 0:e6df770c0e58 draft
Initial upload
author: weilong-guo
date: Fri, 12 Jul 2013 18:47:28 -0400
children: 8b26adf64adc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BSseeker2/bs_align/bs_pair_end.py	Fri Jul 12 18:47:28 2013 -0400
@@ -0,0 +1,1011 @@
+import fileinput, os, random, math
+from bs_utils.utils import *
+from bs_align_utils import *
+#----------------------------------------------------------------
+def extract_mapping(ali_file):
+    unique_hits = {}
+    non_unique_hits = {}
+    header0 = ""
+    family = []
+    for header, chr, no_mismatch, location1, cigar1, location2, cigar2 in process_aligner_output(ali_file, pair_end = True):
+        #------------------------
+        if header != header0:
+
+            # --- output ----
+            if len(family) == 1:
+                unique_hits[header0] = family[0]
+            elif len(family) > 1:
+                min_lst = min(family, key = lambda x: x[0])
+                max_lst = max(family, key = lambda x: x[0])
+
+                if min_lst[0] < max_lst[0]:
+                    unique_hits[header0] = min_lst
+                else:
+                    non_unique_hits[header0] = min_lst[0]
+
+            header0 = header
+            family = []
+        family.append((no_mismatch, chr, location1, cigar1, location2, cigar2))
+
+    if len(family) == 1:
+        unique_hits[header0] = family[0]
+    elif len(family) > 1:
+        min_lst = min(family, key = lambda x: x[0])
+        max_lst = max(family, key = lambda x: x[0])
+
+        if min_lst[0] < max_lst[0]:
+            unique_hits[header0] = min_lst
+        else:
+            non_unique_hits[header0] = min_lst[0]
+
+    return unique_hits, non_unique_hits
+
+def _extract_mapping(ali_file):
+    U = {}
+    R = {}
+    header0 = ""
+    family = []
+    for line in fileinput.input(ali_file):
+        l = line.split()
+        header = l[0][:-2]
+        chr = str(l[1])
+        location = int(l[2])
+        #no_hits=int(l[4])
+        #-------- mismatches -----------
+        if len(l) == 4:
+            no_mismatch = 0
+        elif len(l) == 5:
+            no_mismatch = l[4].count(":")
+        else:
+            print l
+        #------------------------
+        if header != header0:
+            #--------------------
+            if header0 != "":
+                # --- output ----
+                if len(family) == 1:
+                    U[header0] = family[0]
+                else:
+                    if family[0][0] < family[1][0]:
+                        U[header0] = family[0]
+                    elif family[1][0] < family[0][0]:
+                        U[header0] = family[1]
+                    else:
+                        R[header0] = family[0][0]
+                family=[]
+                # ---------------
+            header0 = header
+            family = [[no_mismatch, chr, location]]
+            member = 1
+        elif header == header0:
+            if member == 1:
+                family[-1][0] += no_mismatch
+                family[-1].append(location)
+                member = 2
+            elif member == 2:
+                family.append([no_mismatch, chr, location])
+                member = 1
+        #------------------------------
+
+    fileinput.close()
+    return U, R
+
+
+#----------------------------------------------------------------
+
+def bs_pair_end(main_read_file_1,
+                main_read_file_2,
+                asktag,
+                adapter_file,
+                cut1,
+                cut2, # add cut3 and cut4?
+                no_small_lines,
+                max_mismatch_no,
+                aligner_command,
+                db_path,
+                tmp_path,
+                outfile, XS_pct, XS_count, show_multiple_hit=False):
+
+
+    #----------------------------------------------------------------
+    adapter=""
+    adapterA=""
+    adapterB=""
+    if adapter_file !="":
+        try :
+            adapter_inf=open(adapter_file,"r")
+            if asktag=="N": #<--- directional library
+                adapter=adapter_inf.readline()
+                adapter_inf.close()
+                adapter=adapter.rstrip("\n")
+            elif asktag=="Y":#<--- un-directional library
+                adapterA=adapter_inf.readline()
+                adapterB=adapter_inf.readline()
+                adapter_inf.close()
+                adapterA=adapterA.rstrip("\n")
+                adapterB=adapterB.rstrip("\n")
+        except IOError :
+            print "[Error] Cannot find adapter file : %s" % adapter_file
+            exit(-1)
+
+
+    #----------------------------------------------------------------
+
+    logm("End 1 filename: %s"% main_read_file_1  )
+    logm("End 2 filename: %s"% main_read_file_2  )
+    logm("The first base (for mapping): %d"% cut1  )
+    logm("The last base (for mapping): %d"% cut2  )
+
+    logm("-------------------------------- " )
+    logm("Un-directional library: %s" % asktag  )
+    logm("Path for short reads aligner: %s"% aligner_command + '\n')
+    logm("Reference genome library path: %s"% db_path  )
+    logm("Number of mismatches allowed: %s"% max_mismatch_no  )
+
+    if adapter_file !="":
+        if asktag=="Y":
+            logm("Adapters to be removed from 3' of the reads:" )
+            logm("-- A: %s" % adapterA  )
+            logm("-- B: %s" % adapterB  )
+        elif asktag=="N":
+            logm("Adapter to be removed from 3' of the reads:" )
+            logm("-- %s" % adapter  )
+
+    logm("-------------------------------- " )
+
+    #----------------------------------------------------------------
+
+    # helper method to join fname with tmp_path
+    tmp_d = lambda fname: os.path.join(tmp_path, fname)
+
+    db_d = lambda fname: os.path.join(db_path, fname)
+
+
+    #----------------------------------------------------------------
+    # splitting the 2 big read files
+
+    input_fname1 = os.path.split(main_read_file_1)[1]
+    input_fname2 = os.path.split(main_read_file_2)[1]
+
+    # TODO: run these in parallel with a subprocess
+    split_file(main_read_file_1, tmp_d(input_fname1)+'-E1-', no_small_lines)
+    split_file(main_read_file_2, tmp_d(input_fname2)+'-E2-', no_small_lines)
+
+    dirList=os.listdir(tmp_path)
+    my_files = zip(sorted(filter(lambda fname: fname.startswith("%s-E1-" % input_fname1), dirList)),
+                   sorted(filter(lambda fname: fname.startswith("%s-E2-" % input_fname2), dirList)))
+
+
+
+    #---- Stats ------------------------------------------------------------
+    all_raw_reads=0
+    all_trimmed=0
+    all_mapped=0
+    all_mapped_passed=0
+
+    all_unmapped=0
+
+    numbers_premapped_lst=[0,0,0,0]
+    numbers_mapped_lst=[0,0,0,0]
+
+    mC_lst=[0,0,0]
+    uC_lst=[0,0,0]
+
+    no_my_files=0
+
+    #----------------------------------------------------------------
+    print "== Start mapping =="
+
+    for read_file_1, read_file_2 in my_files:
+
+        no_my_files+=1
+
+        random_id=".tmp-"+str(random.randint(1000000,9999999))
+
+        original_bs_reads_1 = {}
+        original_bs_reads_2 = {}
+        original_bs_reads_lst= [original_bs_reads_1, original_bs_reads_2]
+
+
+        if asktag == "Y" :
+
+            #----------------------------------------------------------------
+            outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id)
+            outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id)
+            outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id)
+            outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id)
+
+            try :
+                read_inf = open(tmp_d(read_file_1),"r")
+            except IOError :
+                print "[Error] Cannot open file : %s", tmp_d(read_file_1)
+                exit(-1)
+            oneline = read_inf.readline()
+            l = oneline.split()
+            input_format = ""
+
+            if oneline[0]=="@":	# Illumina GAII FastQ (Lister et al Nature 2009)
+                input_format="fastq"
+                n_fastq=0
+            elif len(l)==1 and oneline[0]!=">": 	# pure sequences
+                input_format="seq"
+            elif len(l)==11:	# Illumina GAII qseq file
+                input_format="qseq"
+            elif oneline[0]==">":	# fasta
+                input_format="fasta"
+                n_fasta=0
+            read_inf.close()
+
+            print "Detected data format: %s" % input_format
+
+            #----------------------------------------------------------------
+            read_file_list   = [read_file_1, read_file_2]
+            outfile_FCT_list = [outfile_1FCT, outfile_2FCT]
+            outfile_RCT_list = [outfile_1RCT, outfile_2RCT]
+            n_list = [0, 0]
+
+
+            for f in range(2):
+                read_file = read_file_list[f]
+                outf_FCT = open(outfile_FCT_list[f], 'w')
+                outf_RCT = open(outfile_RCT_list[f], 'w')
+                original_bs_reads = original_bs_reads_lst[f]
+                n = n_list[f]
+
+                seq_id = ""
+                seq = ""
+                seq_ready = "N"
+                for line in fileinput.input(tmp_d(read_file)):
+                    l=line.split()
+                    if input_format=="seq":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[0]
+                        seq_ready="Y"
+                    elif input_format=="fastq":
+                        m_fastq=math.fmod(n_fastq,4)
+                        n_fastq+=1
+                        seq_ready="N"
+                        if m_fastq==0:
+                            n+=1
+                            seq_id=str(n)
+                            seq_id=seq_id.zfill(12)
+                            seq=""
+                        elif m_fastq==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    elif input_format=="qseq":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[8]
+                        seq_ready="Y"
+                    elif input_format=="fasta":
+                        m_fasta=math.fmod(n_fasta,2)
+                        n_fasta+=1
+                        seq_ready="N"
+                        if m_fasta==0:
+                            n+=1
+                            seq_id=l[0][1:]
+                            seq_id=seq_id.zfill(17)
+                            seq=""
+                        elif m_fasta==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    #----------------------------------------------------------------
+                    if seq_ready=="Y":
+                        seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72  -e 52
+                        seq=seq.upper()
+                        seq=seq.replace(".","N")
+
+                        #--striping BS adapter from 3' read --------------------------------------------------------------
+                        if (adapterA !="") and (adapterB !=""):
+                            signature=adapterA[:6]
+                            if signature in seq:
+                                signature_pos=seq.index(signature)
+                                if seq[signature_pos:] in adapterA:
+                                    seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
+                                    all_trimmed+=1
+                            else:
+                                signature=adapterB[:6]
+                                if signature in seq:
+                                    #print seq_id,seq,signature;
+                                    signature_pos=seq.index(signature)
+                                    if seq[signature_pos:] in adapterB:
+                                        seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
+                                        all_trimmed+=1
+
+                        if len(seq) <= 4:
+                            seq = "N" * (cut2-cut1+1)
+                        #---------  trimmed_raw_BS_read  ------------------
+                        original_bs_reads[seq_id] = seq
+
+                        #---------  FW_C2T  ------------------
+                        outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T")))
+                        #---------  RC_G2A  ------------------
+                        outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A")))
+
+                n_list[f]=n
+
+                outf_FCT.close()
+                outf_RCT.close()
+
+                fileinput.close()
+
+
+            #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
+            all_raw_reads+=n
+
+            #--------------------------------------------------------------------------------
+            # Bowtie mapping
+            #--------------------------------------------------------------------------------
+            WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+            WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id)
+
+            run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                      'input_file_1' : outfile_1FCT,
+                                                      'input_file_2' : outfile_2RCT,
+                                                      'output_file' : WC2T_fr},
+
+                             aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                      'input_file_1' : outfile_1FCT,
+                                                      'input_file_2' : outfile_2RCT,
+                                                      'output_file' : CC2T_fr},
+
+                             aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                                      'input_file_1' : outfile_2FCT,
+                                                      'input_file_2' : outfile_1RCT,
+                                                      'output_file' : WC2T_rf},
+
+                             aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                                      'input_file_1' : outfile_2FCT,
+                                                      'input_file_2' : outfile_1RCT,
+                                                      'output_file' : CC2T_rf}])
+
+
+            delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT)
+
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+
+            FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
+            FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf)
+            RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
+            RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf)
+
+            delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf)
+
+            #----------------------------------------------------------------
+            # get uniq-hit reads
+            #----------------------------------------------------------------
+            Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys())
+
+            Unique_FW_fr_C2T=set() # +
+            Unique_FW_rf_C2T=set() # +
+            Unique_RC_fr_C2T=set() # -
+            Unique_RC_rf_C2T=set() # -
+            Multiple_hits=set()
+
+
+            for x in Union_set:
+                _list=[]
+                for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]:
+                    mis_lst=d.get(x,[99])
+                    mis=int(mis_lst[0])
+                    _list.append(mis)
+                for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]:
+                    mis=d.get(x,99)
+                    _list.append(mis)
+                mini=min(_list)
+                if _list.count(mini)==1:
+                    mini_index=_list.index(mini)
+                    if mini_index==0:
+                        Unique_FW_fr_C2T.add(x)
+                    elif mini_index==1:
+                        Unique_FW_rf_C2T.add(x)
+                    elif mini_index==2:
+                        Unique_RC_fr_C2T.add(x)
+                    elif mini_index==3:
+                        Unique_RC_rf_C2T.add(x)
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+
+            # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            del Union_set
+            del FW_C2T_fr_R
+            del FW_C2T_rf_R
+            del RC_C2T_fr_R
+            del RC_C2T_rf_R
+
+            FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
+            FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T]
+            RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
+            RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T]
+
+            FW_C2T_fr_uniq_lst.sort()
+            FW_C2T_rf_uniq_lst.sort()
+            RC_C2T_fr_uniq_lst.sort()
+            RC_C2T_rf_uniq_lst.sort()
+            FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
+            FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst]
+            RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
+            RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst]
+            #----------------------------------------------------------------
+
+            numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
+            numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T)
+            numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T)
+            numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T)
+
+            del Unique_FW_fr_C2T
+            del Unique_FW_rf_C2T
+            del Unique_RC_fr_C2T
+            del Unique_RC_rf_C2T
+
+            #----------------------------------------------------------------
+
+            nn = 0
+            for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U),
+                                            (FW_C2T_rf_uniq_lst,FW_C2T_rf_U),
+                                            (RC_C2T_fr_uniq_lst,RC_C2T_fr_U),
+                                            (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]:
+                nn += 1
+
+                mapped_chr0 = ""
+                for header in ali_unique_lst:
+
+                    _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
+
+                    #-------------------------------------
+                    if mapped_chr != mapped_chr0:
+                        my_gseq=deserialize(db_d(mapped_chr))
+                        chr_length=len(my_gseq)
+                        mapped_chr0=mapped_chr
+                    #-------------------------------------
+                    if nn == 1 or nn == 3:
+                        original_BS_1 = original_bs_reads_1[header]
+                        original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
+                    else:
+                        original_BS_1 = original_bs_reads_2[header]
+                        original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
+
+                    r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
+                    r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
+
+                    all_mapped += 1
+
+                    if nn == 1: 							# FW-RC mapped to + strand:
+
+                        FR = "+FR"
+
+#                        mapped_location_1 += 1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "+"
+
+#                        mapped_location_2 += 1
+#                        origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "+"
+
+                    elif nn==2: 							# RC-FW mapped to + strand:
+
+#                        original_BS_1 = original_bs_reads_2[header]
+#                        original_BS_2 = reverse_compl_seq(original_bs_reads_1[header])
+                        FR = "+RF"
+#                        mapped_location_1 += 1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "+"
+
+#                        mapped_location_2 += 1
+#                        origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "+"
+
+
+                    elif nn==3: 						# FW-RC mapped to - strand:
+#                        original_BS_1=original_bs_reads_1[header]
+#                        original_BS_2=reverse_compl_seq(original_bs_reads_2[header])
+
+                        FR = "-FR"
+                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "-"
+
+                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
+#                        origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ])
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "-"
+
+                    elif nn==4: 						# RC-FW mapped to - strand:
+#                        original_BS_1=original_bs_reads_2[header]
+#                        original_BS_2=reverse_compl_seq(original_bs_reads_1[header])
+
+                        FR = "-RF"
+                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "-"
+
+                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
+#                        origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "-"
+
+
+
+
+                    origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
+                    origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
+
+                    r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
+                    r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
+
+
+                    N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
+                    N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
+
+
+                    if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) :
+                        all_mapped_passed += 1
+                        numbers_mapped_lst[nn-1] += 1
+                        #---- unmapped -------------------------
+                        del original_bs_reads_1[header]
+                        del original_bs_reads_2[header]
+                        #---------------------------------------
+
+#                        output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
+#                        output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
+
+                        methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
+                        methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
+
+                        mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst)
+                        mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst)
+
+
+                        # 
+                        #---XS FILTER----------------
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_1 = 0
+                        nCH_1 = methy_1.count('y') + methy_1.count('z')
+                        nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
+                        if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
+                            XS_1 = 1
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_2 = 0
+                        nCH_2 = methy_2.count('y') + methy_2.count('z')
+                        nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
+                        if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
+                            XS_2 = 1
+
+
+                        outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
+                            output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
+                        outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
+                            output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
+
+            print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
+            #----------------------------------------------------------------
+            #	output unmapped pairs
+            #----------------------------------------------------------------
+
+            unmapped_lst=original_bs_reads_1.keys()
+            unmapped_lst.sort()
+
+#            for u in unmapped_lst:
+#                outf_u1.write("%s\n"%original_bs_reads_1[u])
+#                outf_u2.write("%s\n"%original_bs_reads_2[u])
+
+            all_unmapped += len(unmapped_lst)
+
+
+        if asktag=="N":
+
+            #----------------------------------------------------------------
+            outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id)
+            outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id)
+
+            try :
+                read_inf=open(tmp_d(read_file_1),"r")
+            except IOError :
+                print "[Error] Cannot open file : %s", tmp_d(read_file_1)
+                exit(-1)
+
+            oneline=read_inf.readline()
+            l=oneline.split()
+            input_format=""
+
+            #if len(l)==5: # old solexa format
+            #	input_format="old Solexa Seq file"
+
+            if oneline[0]=="@":	# Illumina GAII FastQ (Lister et al Nature 2009)
+                input_format="FastQ"
+                n_fastq=0
+            elif len(l)==1 and oneline[0]!=">": 	# pure sequences
+                input_format="_list of sequences"
+            elif len(l)==11:	# Illumina GAII qseq file
+                input_format="Illumina GAII qseq file"
+            elif oneline[0]==">":	# fasta
+                input_format="fasta"
+                n_fasta=0
+
+            read_inf.close()
+
+            print "Detected data format: %s" % input_format
+
+            #----------------------------------------------------------------
+            read_file_list=[read_file_1,read_file_2]
+            outfile_FCT_list=[outfile_1FCT,outfile_2FCT]
+            n_list=[0,0]
+
+            for f in range(2):
+                read_file=read_file_list[f]
+                outf_FCT=open(outfile_FCT_list[f],'w')
+                original_bs_reads = original_bs_reads_lst[f]
+                n=n_list[f]
+
+                seq_id=""
+                seq=""
+                seq_ready="N"
+                for line in fileinput.input(tmp_d(read_file)):
+                    l=line.split()
+                    if input_format=="old Solexa Seq file":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[4]
+                        seq_ready="Y"
+                    elif input_format=="_list of sequences":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[0]
+                        seq_ready="Y"
+                    elif input_format=="FastQ":
+                        m_fastq=math.fmod(n_fastq,4)
+                        n_fastq+=1
+                        seq_ready="N"
+                        if m_fastq==0:
+                            n+=1
+                            seq_id=str(n)
+                            seq_id=seq_id.zfill(12)
+                            seq=""
+                        elif m_fastq==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    elif input_format=="Illumina GAII qseq file":
+                        n+=1
+                        seq_id=str(n)
+                        seq_id=seq_id.zfill(12)
+                        seq=l[8]
+                        seq_ready="Y"
+                    elif input_format=="fasta":
+                        m_fasta=math.fmod(n_fasta,2)
+                        n_fasta+=1
+                        seq_ready="N"
+                        if m_fasta==0:
+                            n+=1
+                            seq_id=l[0][1:]
+                            seq_id=seq_id.zfill(17)
+                            seq=""
+                        elif m_fasta==1:
+                            seq=l[0]
+                            seq_ready="Y"
+                        else:
+                            seq=""
+                    #----------------------------------------------------------------
+                    if seq_ready=="Y":
+                        seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72  -e 52
+                        seq=seq.upper()
+                        seq=seq.replace(".","N")
+
+                        #--striping BS adapter from 3' read --------------------------------------------------------------
+                        if (adapterA !="") and (adapterB !=""):
+                            signature=adapterA[:6]
+                            if signature in seq:
+                                signature_pos=seq.index(signature)
+                                if seq[signature_pos:] in adapterA:
+                                    seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
+                                    all_trimmed+=1
+                            else:
+                                signature=adapterB[:6]
+                                if signature in seq:
+                                    #print seq_id,seq,signature;
+                                    signature_pos=seq.index(signature)
+                                    if seq[signature_pos:] in adapterB:
+                                        seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))])
+                                        all_trimmed+=1
+
+                        if len(seq) <= 4:
+                            seq = "N" * (cut2-cut1+1)
+                        #---------  trimmed_raw_BS_read  ------------------
+                        original_bs_reads[seq_id] = seq
+
+                        #---------  FW_C2T  ------------------
+                        if f==0:
+                            outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T")))
+                        elif f==1:
+                            outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T")))
+
+
+                n_list[f]=n
+                outf_FCT.close()
+
+                fileinput.close()
+
+
+            #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]);
+            all_raw_reads+=n
+
+            #--------------------------------------------------------------------------------
+            # Bowtie mapping
+            #--------------------------------------------------------------------------------
+            WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+            CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id)
+
+            run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'),
+                                         'input_file_1' : outfile_1FCT,
+                                         'input_file_2' : outfile_2FCT,
+                                         'output_file' : WC2T_fr},
+
+                              aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'),
+                                         'input_file_1' : outfile_1FCT,
+                                         'input_file_2' : outfile_2FCT,
+                                         'output_file' : CC2T_fr} ])
+
+
+            delete_files(outfile_1FCT, outfile_2FCT)
+
+            #--------------------------------------------------------------------------------
+            # Post processing
+            #--------------------------------------------------------------------------------
+
+            FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr)
+            RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr)
+
+            #----------------------------------------------------------------
+            # get uniq-hit reads
+            #----------------------------------------------------------------
+            Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys())
+
+            Unique_FW_fr_C2T = set() # +
+            Unique_RC_fr_C2T = set() # -
+            Multiple_hits=set()
+
+
+            for x in Union_set:
+                _list = []
+                for d in [FW_C2T_fr_U, RC_C2T_fr_U]:
+                    mis_lst = d.get(x,[99])
+                    mis = int(mis_lst[0])
+                    _list.append(mis)
+                for d in [FW_C2T_fr_R, RC_C2T_fr_R]:
+                    mis = d.get(x,99)
+                    _list.append(mis)
+                mini = min(_list)
+                if _list.count(mini) == 1:
+                    mini_index = _list.index(mini)
+                    if mini_index == 0:
+                        Unique_FW_fr_C2T.add(x)
+                    elif mini_index == 1:
+                        Unique_RC_fr_C2T.add(x)
+                    else :
+                        Multiple_hits.add(x)
+                else :
+                    Multiple_hits.add(x)
+
+           # write reads rejected by Multiple Hits to file
+            if show_multiple_hit :
+                outf_MH=open("Multiple_hit.fa",'w')
+                for i in Multiple_hits :
+                    outf_MH.write(">%s\n" % i)
+                    outf_MH.write("%s\n" % original_bs_reads[i])
+                outf_MH.close()
+
+            FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T]
+            RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T]
+
+            FW_C2T_fr_uniq_lst.sort()
+            RC_C2T_fr_uniq_lst.sort()
+            FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst]
+            RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst]
+            #----------------------------------------------------------------
+
+            numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T)
+            numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T)
+
+
+            #----------------------------------------------------------------
+
+            nn = 0
+            for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]:
+                nn += 1
+                mapped_chr0 = ""
+                for header in ali_unique_lst:
+                    _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header]
+
+
+                    #-------------------------------------
+                    if mapped_chr != mapped_chr0:
+                        my_gseq = deserialize(db_d(mapped_chr))
+                        chr_length = len(my_gseq)
+                        mapped_chr0 = mapped_chr
+                    #-------------------------------------
+
+                    original_BS_1 = original_bs_reads_1[header]
+                    original_BS_2 = reverse_compl_seq(original_bs_reads_2[header])
+
+                    r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1)
+                    r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2)
+
+                    all_mapped += 1
+
+                    if nn == 1: 							# FW-RC mapped to + strand:
+                        FR = "+FR"
+#                        mapped_location_1 += 1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "+"
+
+#                        mapped_location_2 += 1
+#                        origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "+"
+
+                    elif nn == 2: 						# FW-RC mapped to - strand:
+
+                        FR="-FR"
+                        mapped_location_1 = chr_length - mapped_location_1 - g_len_1
+#                        origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1]
+#                        origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1)
+#                        origin_genome_1 = origin_genome_long_1[2:-2]
+                        mapped_strand_1 = "-"
+
+                        mapped_location_2 = chr_length - mapped_location_2 - g_len_2
+#                        origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1])
+#                        origin_genome_2 = origin_genome_long_2[2:-2]
+                        mapped_strand_2 = "-"
+
+
+
+                    origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1)
+                    origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2)
+
+
+                    r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1)
+                    r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2)
+
+                    N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides
+                    N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides
+
+                    if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no):
+
+                        numbers_mapped_lst[nn-1] += 1
+                        all_mapped_passed += 1
+
+                        #---- unmapped -------------------------
+                        del original_bs_reads_1[header]
+                        del original_bs_reads_2[header]
+
+                        #---------------------------------------
+#                        output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:]
+#                        output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:]
+                        methy_1=methy_seq(r_aln_1, g_aln_1 + next_1)
+                        methy_2=methy_seq(r_aln_2, g_aln_2 + next_2)
+                        mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst)
+                        mC_lst,uC_lst = mcounts(methy_2, mC_lst, uC_lst)
+
+                        # 
+                        #---XS FILTER----------------
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_1 = 0
+                        nCH_1 = methy_1.count('y') + methy_1.count('z')
+                        nmCH_1 = methy_1.count('Y') + methy_1.count('Z')
+                        if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) :
+                            XS_1 = 1
+                        #XS = 1 if "ZZZ" in methy.replace('-', '') else 0
+                        XS_2 = 0
+                        nCH_2 = methy_2.count('y') + methy_2.count('z')
+                        nmCH_2 = methy_2.count('Y') + methy_2.count('Z')
+                        if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) :
+                            XS_2 = 1
+
+
+                        outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1,
+                                      output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2)
+                        outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2,
+                                      output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1)
+
+            print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files))
+            #----------------------------------------------------------------
+            #	output unmapped pairs
+            #----------------------------------------------------------------
+
+            unmapped_lst=original_bs_reads_1.keys()
+            unmapped_lst.sort()
+
+#            for u in unmapped_lst:
+#                outf_u1.write("%s\n"%(original_bs_reads_1[u]))
+#                outf_u2.write("%s\n"%(original_bs_reads_2[u]) )
+
+            all_unmapped+=len(unmapped_lst)
+
+
+    #==================================================================================================
+#    outf.close()
+#
+#    outf_u1.close()
+#    outf_u2.close()
+    delete_files(tmp_path)
+
+    logm("-------------------------------- " )
+    logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) )
+    logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n")
+
+    if all_raw_reads >0:
+
+        logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n")
+        if asktag=="Y":
+            logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
+            logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) )
+            logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) )
+            logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) )
+        elif asktag=="N":
+            logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) )
+            logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) )
+
+
+        logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) )
+        logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) )
+        if asktag=="Y":
+            logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
+            logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) )
+            logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) )
+            logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) )
+        elif asktag=="N":
+            logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) )
+            logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) )
+
+        logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) )
+        logm("O Unmapped read pairs: %d"% all_unmapped+"\n")
+
+
+        n_CG=mC_lst[0]+uC_lst[0]
+        n_CHG=mC_lst[1]+uC_lst[1]
+        n_CHH=mC_lst[2]+uC_lst[2]
+
+        logm("-------------------------------- " )
+        logm("Methylated C in mapped reads " )
+        logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0) )
+        logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0) )
+        logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0) )
+
+    logm("----------------------------------------------" )
+    logm("------------------- END ----------------------" )
+    elapsed("=== END %s %s ===" % (main_read_file_1, main_read_file_2))
author	weilong-guo
date	Fri, 12 Jul 2013 18:47:28 -0400
parents
children	8b26adf64adc