Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/bs_align/bs_pair_end.py @ 1:8b26adf64adc draft default tip
V2.0.5
author | weilong-guo |
---|---|
date | Tue, 05 Nov 2013 01:55:39 -0500 |
parents | e6df770c0e58 |
children |
line wrap: on
line diff
--- a/BSseeker2/bs_align/bs_pair_end.py Fri Jul 12 18:47:28 2013 -0400 +++ b/BSseeker2/bs_align/bs_pair_end.py Tue Nov 05 01:55:39 2013 -0500 @@ -104,13 +104,13 @@ aligner_command, db_path, tmp_path, - outfile, XS_pct, XS_count, show_multiple_hit=False): + outfile, XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False): #---------------------------------------------------------------- - adapter="" - adapterA="" - adapterB="" + adapter = "" + adapterA = "" + adapterB = "" if adapter_file !="": try : adapter_inf=open(adapter_file,"r") @@ -134,12 +134,15 @@ logm("End 1 filename: %s"% main_read_file_1 ) logm("End 2 filename: %s"% main_read_file_2 ) logm("The first base (for mapping): %d"% cut1 ) - logm("The last base (for mapping): %d"% cut2 ) + logm("The last base (for mapping): %d"% cut2 ) - logm("-------------------------------- " ) - logm("Un-directional library: %s" % asktag ) logm("Path for short reads aligner: %s"% aligner_command + '\n') logm("Reference genome library path: %s"% db_path ) + + if asktag == "Y" : + logm("Un-directional library" ) + else : + logm("Directional library") logm("Number of mismatches allowed: %s"% max_mismatch_no ) if adapter_file !="": @@ -182,6 +185,9 @@ all_trimmed=0 all_mapped=0 all_mapped_passed=0 + all_base_before_trim=0 + all_base_after_trim=0 + all_base_mapped=0 all_unmapped=0 @@ -192,6 +198,8 @@ uC_lst=[0,0,0] no_my_files=0 + read_id_lst_1 = dict() + read_id_lst_2 = dict() #---------------------------------------------------------------- print "== Start mapping ==" @@ -212,11 +220,14 @@ #---------------------------------------------------------------- outfile_1FCT = tmp_d('Trimmed_FCT_1.fa'+random_id) outfile_1RCT = tmp_d('Trimmed_RCT_1.fa'+random_id) - outfile_2FCT = tmp_d('Trimmed_FCT_2.fa'+random_id) + outfile_2RGA = tmp_d('Trimmed_FCT_2.fa'+random_id) outfile_2RCT = tmp_d('Trimmed_RCT_2.fa'+random_id) try : - read_inf = open(tmp_d(read_file_1),"r") + if read_file_1.endswith(".gz") : # support input file ending with ".gz" + read_inf = gzip.open(tmp_d(read_file_1), "rb") + else : + read_inf = open(tmp_d(read_file_1),"r") except IOError : print "[Error] Cannot open file : %s", tmp_d(read_file_1) exit(-1) @@ -226,128 +237,131 @@ if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) input_format="fastq" - n_fastq=0 elif len(l)==1 and oneline[0]!=">": # pure sequences input_format="seq" elif len(l)==11: # Illumina GAII qseq file input_format="qseq" elif oneline[0]==">": # fasta input_format="fasta" - n_fasta=0 read_inf.close() print "Detected data format: %s" % input_format #---------------------------------------------------------------- read_file_list = [read_file_1, read_file_2] - outfile_FCT_list = [outfile_1FCT, outfile_2FCT] + outfile_FCT_list = [outfile_1FCT, outfile_2RGA] outfile_RCT_list = [outfile_1RCT, outfile_2RCT] n_list = [0, 0] - - for f in range(2): read_file = read_file_list[f] outf_FCT = open(outfile_FCT_list[f], 'w') - outf_RCT = open(outfile_RCT_list[f], 'w') + outf_RGA = open(outfile_RCT_list[f], 'w') original_bs_reads = original_bs_reads_lst[f] n = n_list[f] - - seq_id = "" + read_id = "" seq = "" seq_ready = "N" + line_no = 0 for line in fileinput.input(tmp_d(read_file)): - l=line.split() + l = line.split() + line_no += 1 if input_format=="seq": - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq=l[0] - seq_ready="Y" + n += 1 + read_id = str(n) + read_id = read_id.zfill(12) + if f == 1 : + read_id_lst_1[read_id] = read_id + else : + read_id_lst_2[read_id] = read_id + seq = l[0] + seq_ready = "Y" elif input_format=="fastq": - m_fastq=math.fmod(n_fastq,4) - n_fastq+=1 - seq_ready="N" - if m_fastq==0: - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq="" - elif m_fastq==1: - seq=l[0] - seq_ready="Y" - else: - seq="" + l_fastq = math.fmod(line_no, 4) + if l_fastq==1 : + all_raw_reads += 1 + read_id = str(line_no/4+1).zfill(12) + if f==1 : + read_id_lst_1[read_id] = l[0][1:] + else : + read_id_lst_2[read_id] = l[0][1:] + seq_ready="N" + elif l_fastq==2 : + seq = l[0] + seq_ready = "Y" + else : + seq = "" + seq_ready = "N" elif input_format=="qseq": - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq=l[8] - seq_ready="Y" + all_raw_reads += 1 + read_id = str(line_no) + read_id = read_id.zfill(12) + if f == 1 : + read_id_lst_1[read_id] = l[0][1:] + else : + read_id_lst_2[read_id] = l[0][1:] + seq = l[8] + seq_ready = "Y" elif input_format=="fasta": - m_fasta=math.fmod(n_fasta,2) - n_fasta+=1 - seq_ready="N" - if m_fasta==0: - n+=1 - seq_id=l[0][1:] - seq_id=seq_id.zfill(17) - seq="" - elif m_fasta==1: - seq=l[0] - seq_ready="Y" - else: - seq="" + l_fasta = math.fmod(line_no,2) + seq_ready = "N" + if l_fasta==1: + all_raw_reads += 1 + read_id = str(line_no/2+1).zfill(12) + if f==1 : + read_id_lst_1[read_id] = l[0][1:] + else : + read_id_lst_2[read_id] = l[0][1:] + seq = "" + elif l_fasta==0 : + seq = l[0] + seq_ready = "Y" #---------------------------------------------------------------- if seq_ready=="Y": - seq=seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 - seq=seq.upper() - seq=seq.replace(".","N") + seq = seq[cut1-1:cut2] #------- selecting 0..52 from 1..72 -e 52 + seq = seq.upper() + seq = seq.replace(".","N") #--striping BS adapter from 3' read -------------------------------------------------------------- - if (adapterA !="") and (adapterB !=""): - signature=adapterA[:6] - if signature in seq: - signature_pos=seq.index(signature) - if seq[signature_pos:] in adapterA: - seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) - all_trimmed+=1 - else: - signature=adapterB[:6] - if signature in seq: - #print seq_id,seq,signature; - signature_pos=seq.index(signature) - if seq[signature_pos:] in adapterB: - seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) - all_trimmed+=1 + all_base_before_trim += len(seq) + if f==0 and adapterA!="" : + new_read = RemoveAdapter(seq, adapterA, adapter_mismatch) + if len(new_read)<len(seq) : + all_trimmed += 1 + seq = new_read + if f==1 and adapterB!="" : + new_read = RemoveAdapter(seq, adapterB, adapter_mismatch) + if len(new_read)<len(seq) : + all_trimmed += 1 + seq = new_read + all_base_after_trim += len(seq) - if len(seq) <= 4: + if len(seq)<=4: seq = "N" * (cut2-cut1+1) #--------- trimmed_raw_BS_read ------------------ - original_bs_reads[seq_id] = seq + original_bs_reads[read_id] = seq #--------- FW_C2T ------------------ - outf_FCT.write('>%s\n%s\n' % (seq_id, seq.replace("C","T"))) + outf_FCT.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) #--------- RC_G2A ------------------ - outf_RCT.write('>%s\n%s\n' % (seq_id, seq.replace("G","A"))) + outf_RGA.write('>%s\n%s\n' % (read_id, seq.replace("G","A"))) - n_list[f]=n + n_list[f] = n outf_FCT.close() - outf_RCT.close() + outf_RGA.close() fileinput.close() - #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); - all_raw_reads+=n + all_raw_reads += n #-------------------------------------------------------------------------------- # Bowtie mapping #-------------------------------------------------------------------------------- - WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) - WC2T_rf=tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) - CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) - CC2T_rf=tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) + WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + WC2T_rf = tmp_d("W_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) + CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + CC2T_rf = tmp_d("C_C2T_rf_m"+max_mismatch_no+".mapping"+random_id) run_in_parallel([aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 'input_file_1' : outfile_1FCT, @@ -357,64 +371,62 @@ aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 'input_file_1' : outfile_1FCT, 'input_file_2' : outfile_2RCT, - 'output_file' : CC2T_fr}, + 'output_file' : CG2A_fr}, aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), - 'input_file_1' : outfile_2FCT, + 'input_file_1' : outfile_2RGA, 'input_file_2' : outfile_1RCT, 'output_file' : WC2T_rf}, aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), - 'input_file_1' : outfile_2FCT, + 'input_file_1' : outfile_2RGA, 'input_file_2' : outfile_1RCT, 'output_file' : CC2T_rf}]) - - delete_files(outfile_1FCT, outfile_2FCT, outfile_1RCT, outfile_2RCT) + delete_files(outfile_1FCT, outfile_2RGA, outfile_1RCT, outfile_2RCT) #-------------------------------------------------------------------------------- # Post processing #-------------------------------------------------------------------------------- - FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) FW_C2T_rf_U, FW_C2T_rf_R = extract_mapping(WC2T_rf) - RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) + RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr) RC_C2T_rf_U, RC_C2T_rf_R = extract_mapping(CC2T_rf) - delete_files(WC2T_fr, WC2T_rf, CC2T_fr, CC2T_rf) + delete_files(WC2T_fr, WC2T_rf, CG2A_fr, CC2T_rf) #---------------------------------------------------------------- # get uniq-hit reads #---------------------------------------------------------------- - Union_set=set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) + Union_set = set(FW_C2T_fr_U.iterkeys()) | set(FW_C2T_rf_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) | set(RC_C2T_rf_U.iterkeys()) - Unique_FW_fr_C2T=set() # + - Unique_FW_rf_C2T=set() # + - Unique_RC_fr_C2T=set() # - - Unique_RC_rf_C2T=set() # - - Multiple_hits=set() + Unique_FW_fr_C2T = set() # + + Unique_FW_rf_C2T = set() # + + Unique_RC_fr_G2A = set() # - + Unique_RC_rf_C2T = set() # - + Multiple_hits = set() for x in Union_set: - _list=[] - for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_C2T_fr_U, RC_C2T_rf_U]: - mis_lst=d.get(x,[99]) - mis=int(mis_lst[0]) + _list = [] + for d in [FW_C2T_fr_U, FW_C2T_rf_U, RC_G2A_fr_U, RC_C2T_rf_U]: + mis_lst = d.get(x,[99]) + mis = int(mis_lst[0]) _list.append(mis) - for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_C2T_fr_R, RC_C2T_rf_R]: - mis=d.get(x,99) + for d in [FW_C2T_fr_R, FW_C2T_rf_R, RC_G2A_fr_R, RC_C2T_rf_R]: + mis = d.get(x,99) _list.append(mis) - mini=min(_list) + mini = min(_list) if _list.count(mini)==1: - mini_index=_list.index(mini) + mini_index = _list.index(mini) if mini_index==0: Unique_FW_fr_C2T.add(x) elif mini_index==1: Unique_FW_rf_C2T.add(x) elif mini_index==2: - Unique_RC_fr_C2T.add(x) + Unique_RC_fr_G2A.add(x) elif mini_index==3: Unique_RC_rf_C2T.add(x) else : @@ -424,7 +436,7 @@ # write reads rejected by Multiple Hits to file if show_multiple_hit : - outf_MH=open("Multiple_hit.fa",'w') + outf_MH = open("Multiple_hit.fa",'w') for i in Multiple_hits : outf_MH.write(">%s\n" % i) outf_MH.write("%s\n" % original_bs_reads[i]) @@ -433,32 +445,32 @@ del Union_set del FW_C2T_fr_R del FW_C2T_rf_R - del RC_C2T_fr_R + del RC_G2A_fr_R del RC_C2T_rf_R - FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] - FW_C2T_rf_uniq_lst=[[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] - RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] - RC_C2T_rf_uniq_lst=[[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] + FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] + FW_C2T_rf_uniq_lst = [[FW_C2T_rf_U[u][1],u] for u in Unique_FW_rf_C2T] + RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A] + RC_C2T_rf_uniq_lst = [[RC_C2T_rf_U[u][1],u] for u in Unique_RC_rf_C2T] FW_C2T_fr_uniq_lst.sort() FW_C2T_rf_uniq_lst.sort() RC_C2T_fr_uniq_lst.sort() RC_C2T_rf_uniq_lst.sort() - FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] - FW_C2T_rf_uniq_lst=[x[1] for x in FW_C2T_rf_uniq_lst] - RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] - RC_C2T_rf_uniq_lst=[x[1] for x in RC_C2T_rf_uniq_lst] + FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst] + FW_C2T_rf_uniq_lst = [x[1] for x in FW_C2T_rf_uniq_lst] + RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst] + RC_C2T_rf_uniq_lst = [x[1] for x in RC_C2T_rf_uniq_lst] #---------------------------------------------------------------- - numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) - numbers_premapped_lst[1]+=len(Unique_FW_rf_C2T) - numbers_premapped_lst[2]+=len(Unique_RC_fr_C2T) - numbers_premapped_lst[3]+=len(Unique_RC_rf_C2T) + numbers_premapped_lst[0] += len(Unique_FW_fr_C2T) + numbers_premapped_lst[1] += len(Unique_FW_rf_C2T) + numbers_premapped_lst[2] += len(Unique_RC_fr_G2A) + numbers_premapped_lst[3] += len(Unique_RC_rf_C2T) del Unique_FW_fr_C2T del Unique_FW_rf_C2T - del Unique_RC_fr_C2T + del Unique_RC_fr_G2A del Unique_RC_rf_C2T #---------------------------------------------------------------- @@ -466,7 +478,7 @@ nn = 0 for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (FW_C2T_rf_uniq_lst,FW_C2T_rf_U), - (RC_C2T_fr_uniq_lst,RC_C2T_fr_U), + (RC_C2T_fr_uniq_lst,RC_G2A_fr_U), (RC_C2T_rf_uniq_lst,RC_C2T_rf_U)]: nn += 1 @@ -476,15 +488,15 @@ _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] #------------------------------------- - if mapped_chr != mapped_chr0: - my_gseq=deserialize(db_d(mapped_chr)) - chr_length=len(my_gseq) - mapped_chr0=mapped_chr + if mapped_chr!=mapped_chr0: + my_gseq = deserialize(db_d(mapped_chr)) + chr_length = len(my_gseq) + mapped_chr0 = mapped_chr #------------------------------------- - if nn == 1 or nn == 3: + if nn==1 or nn==3 : original_BS_1 = original_bs_reads_1[header] original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) - else: + else : original_BS_1 = original_bs_reads_2[header] original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) @@ -493,83 +505,41 @@ all_mapped += 1 - if nn == 1: # FW-RC mapped to + strand: - + if nn==1 : # FW-RC mapped to + strand: FR = "+FR" - -# mapped_location_1 += 1 -# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] -# origin_genome_1 = origin_genome_long_1[2:-2] mapped_strand_1 = "+" - -# mapped_location_2 += 1 -# origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] -# origin_genome_2 = origin_genome_long_2[2:-2] mapped_strand_2 = "+" - - elif nn==2: # RC-FW mapped to + strand: - -# original_BS_1 = original_bs_reads_2[header] -# original_BS_2 = reverse_compl_seq(original_bs_reads_1[header]) + elif nn==2 : # RC-FW mapped to + strand: FR = "+RF" -# mapped_location_1 += 1 -# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] -# origin_genome_1 = origin_genome_long_1[2:-2] mapped_strand_1 = "+" - -# mapped_location_2 += 1 -# origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] -# origin_genome_2 = origin_genome_long_2[2:-2] mapped_strand_2 = "+" - - - elif nn==3: # FW-RC mapped to - strand: -# original_BS_1=original_bs_reads_1[header] -# original_BS_2=reverse_compl_seq(original_bs_reads_2[header]) - + elif nn==3 : # FW-RC mapped to - strand: FR = "-FR" mapped_location_1 = chr_length - mapped_location_1 - g_len_1 -# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] -# origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) -# origin_genome_1 = origin_genome_long_1[2:-2] mapped_strand_1 = "-" - mapped_location_2 = chr_length - mapped_location_2 - g_len_2 -# origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1 ]) -# origin_genome_2 = origin_genome_long_2[2:-2] mapped_strand_2 = "-" - - elif nn==4: # RC-FW mapped to - strand: -# original_BS_1=original_bs_reads_2[header] -# original_BS_2=reverse_compl_seq(original_bs_reads_1[header]) - + elif nn==4 : # RC-FW mapped to - strand: FR = "-RF" mapped_location_1 = chr_length - mapped_location_1 - g_len_1 -# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] -# origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) -# origin_genome_1 = origin_genome_long_1[2:-2] mapped_strand_1 = "-" - mapped_location_2 = chr_length - mapped_location_2 - g_len_2 -# origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) -# origin_genome_2 = origin_genome_long_2[2:-2] mapped_strand_2 = "-" - - - origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) - N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides + N_mismatch = max(N_mismatch_1, N_mismatch_2) + mm_no=float(max_mismatch_no) + if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1) + and N_mismatch_2<=mm_no*len(original_BS_2)): - if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no) : all_mapped_passed += 1 numbers_mapped_lst[nn-1] += 1 #---- unmapped ------------------------- @@ -577,43 +547,46 @@ del original_bs_reads_2[header] #--------------------------------------- -# output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] -# output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] - - methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) - methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) + methy_1 = methy_seq(r_aln_1, g_aln_1 + next_1) + methy_2 = methy_seq(r_aln_2, g_aln_2 + next_2) mC_lst, uC_lst = mcounts(methy_1, mC_lst, uC_lst) mC_lst, uC_lst = mcounts(methy_2, mC_lst, uC_lst) - - # #---XS FILTER---------------- - #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 XS_1 = 0 nCH_1 = methy_1.count('y') + methy_1.count('z') nmCH_1 = methy_1.count('Y') + methy_1.count('Z') if( (nmCH_1>XS_count) and nmCH_1/float(nCH_1+nmCH_1)>XS_pct ) : XS_1 = 1 - #XS = 1 if "ZZZ" in methy.replace('-', '') else 0 XS_2 = 0 nCH_2 = methy_2.count('y') + methy_2.count('z') nmCH_2 = methy_2.count('Y') + methy_2.count('Z') if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : XS_2 = 1 + if mapped_strand_1=='+' : + flag_1 = 67 # 1000011 : 1st read, + strand + flag_2 = 131 #10000011 : 2nd read, + strand + else : + flag_1 = 115 # 1110011 : 1st read, - strand + flag_2 = 179 # 10110011 : 2nd read, - strand - outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, + outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) - outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, + all_base_mapped += len(original_BS_1) + + outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) + all_base_mapped += len(original_BS_2) + print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) #---------------------------------------------------------------- # output unmapped pairs #---------------------------------------------------------------- - unmapped_lst=original_bs_reads_1.keys() + unmapped_lst = original_bs_reads_1.keys() unmapped_lst.sort() # for u in unmapped_lst: @@ -623,189 +596,188 @@ all_unmapped += len(unmapped_lst) + # Directional library if asktag=="N": #---------------------------------------------------------------- - outfile_1FCT= tmp_d('Trimed_FCT_1.fa'+random_id) - outfile_2FCT= tmp_d('Trimed_FCT_2.fa'+random_id) + outfile_1FCT = tmp_d('Trimed_FCT_1.fa'+random_id) + outfile_2RGA = tmp_d('Trimed_RGA_2.fa'+random_id) try : - read_inf=open(tmp_d(read_file_1),"r") + if read_file_1.endswith(".gz") : # support input file ending with ".gz" + read_inf = gzip.open(tmp_d(read_file_1), "rb") + else : + read_inf = open(tmp_d(read_file_1),"r") except IOError : print "[Error] Cannot open file : %s", tmp_d(read_file_1) exit(-1) - oneline=read_inf.readline() - l=oneline.split() - input_format="" - - #if len(l)==5: # old solexa format - # input_format="old Solexa Seq file" + oneline = read_inf.readline() + l = oneline.split() + input_format = "" - if oneline[0]=="@": # Illumina GAII FastQ (Lister et al Nature 2009) - input_format="FastQ" - n_fastq=0 - elif len(l)==1 and oneline[0]!=">": # pure sequences - input_format="_list of sequences" - elif len(l)==11: # Illumina GAII qseq file - input_format="Illumina GAII qseq file" - elif oneline[0]==">": # fasta - input_format="fasta" - n_fasta=0 + if oneline[0]=="@" : + input_format = "fastq" + elif len(l)==1 and oneline[0]!=">" : + input_format = "seq" + elif len(l)==11 : + input_format = "qseq" + elif oneline[0]==">" : + input_format = "fasta" read_inf.close() print "Detected data format: %s" % input_format #---------------------------------------------------------------- - read_file_list=[read_file_1,read_file_2] - outfile_FCT_list=[outfile_1FCT,outfile_2FCT] - n_list=[0,0] + read_file_list = [read_file_1,read_file_2] + outfile_FCT_list = [outfile_1FCT,outfile_2RGA] + n_list = [0,0] for f in range(2): - read_file=read_file_list[f] - outf_FCT=open(outfile_FCT_list[f],'w') + read_file = read_file_list[f] + outf_FCT = open(outfile_FCT_list[f],'w') original_bs_reads = original_bs_reads_lst[f] - n=n_list[f] + n = n_list[f] - seq_id="" - seq="" - seq_ready="N" + read_id = "" + seq = "" + seq_ready = "N" + line_no = 0 for line in fileinput.input(tmp_d(read_file)): - l=line.split() - if input_format=="old Solexa Seq file": - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq=l[4] - seq_ready="Y" - elif input_format=="_list of sequences": - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq=l[0] - seq_ready="Y" - elif input_format=="FastQ": - m_fastq=math.fmod(n_fastq,4) - n_fastq+=1 - seq_ready="N" - if m_fastq==0: - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq="" - elif m_fastq==1: - seq=l[0] - seq_ready="Y" - else: - seq="" - elif input_format=="Illumina GAII qseq file": - n+=1 - seq_id=str(n) - seq_id=seq_id.zfill(12) - seq=l[8] - seq_ready="Y" + l = line.split() + line_no += 1 + if input_format=="seq": + n += 1 + read_id = str(n) + read_id = read_id.zfill(12) + if f == 1 : + read_id_lst_1[read_id] = read_id + else : + read_id_lst_2[read_id] = read_id + seq = l[0] + seq_ready = "Y" + elif input_format=="fastq": + l_fastq = math.fmod(line_no, 4) + if l_fastq==1 : + all_raw_reads += 1 + read_id = str(line_no/4+1).zfill(12) + if f==1 : + read_id_lst_1[read_id] = l[0][1:] + else : + read_id_lst_2[read_id] = l[0][1:] + seq_ready = "N" + elif l_fastq==2 : + seq = l[0] + seq_ready = "Y" + else : + seq = "" + seq_ready = "N" + elif input_format=="qseq": + all_raw_reads += 1 + read_id = str(line_no) + read_id = read_id.zfill(12) + if f == 1 : + read_id_lst_1[read_id] = l[0][1:] + else : + read_id_lst_2[read_id] = l[0][1:] + seq = l[8] + seq_ready = "Y" elif input_format=="fasta": - m_fasta=math.fmod(n_fasta,2) - n_fasta+=1 - seq_ready="N" - if m_fasta==0: - n+=1 - seq_id=l[0][1:] - seq_id=seq_id.zfill(17) - seq="" - elif m_fasta==1: - seq=l[0] - seq_ready="Y" - else: - seq="" + l_fasta = math.fmod(line_no,2) + if l_fasta==1: + seq_ready = "N" + all_raw_reads += 1 + read_id = str(line_no/2+1).zfill(12) + if f == 1 : + read_id_lst_1[read_id] = l[0][1:] + else : + read_id_lst_2[read_id] = l[0][1:] + seq = "" + elif l_fasta==0 : + seq = l[0] + seq_ready = "Y" #---------------------------------------------------------------- if seq_ready=="Y": - seq=seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 - seq=seq.upper() - seq=seq.replace(".","N") + seq = seq[cut1-1:cut2] #<----------------------selecting 0..52 from 1..72 -e 52 + seq = seq.upper() + seq = seq.replace(".","N") #--striping BS adapter from 3' read -------------------------------------------------------------- - if (adapterA !="") and (adapterB !=""): - signature=adapterA[:6] - if signature in seq: - signature_pos=seq.index(signature) - if seq[signature_pos:] in adapterA: - seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) - all_trimmed+=1 - else: - signature=adapterB[:6] - if signature in seq: - #print seq_id,seq,signature; - signature_pos=seq.index(signature) - if seq[signature_pos:] in adapterB: - seq=seq[:signature_pos]#+"".join(["N" for x in range(len(seq)-len(signature_pos))]) - all_trimmed+=1 + all_base_before_trim += len(seq) + if f==0 and adapterA!="" : + new_read = RemoveAdapter(seq, adapterA, adapter_mismatch) + if len(new_read) < len(seq) : + all_trimmed += 1 + seq = new_read + if f==1 and adapterB!="" : + new_read = RemoveAdapter(seq, adapterB, adapter_mismatch) + if len(new_read)<len(seq) : + all_trimmed += 1 + seq = new_read + all_base_after_trim += len(seq) - if len(seq) <= 4: + if len(seq)<=4: seq = "N" * (cut2-cut1+1) #--------- trimmed_raw_BS_read ------------------ - original_bs_reads[seq_id] = seq + original_bs_reads[read_id] = seq #--------- FW_C2T ------------------ if f==0: - outf_FCT.write('>%s\n%s\n'% (seq_id, seq.replace("C","T"))) + outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("C","T"))) elif f==1: - outf_FCT.write('>%s\n%s\n'% (seq_id, reverse_compl_seq(seq).replace("C","T"))) + outf_FCT.write('>%s\n%s\n'% (read_id, seq.replace("G","A"))) - - n_list[f]=n + n_list[f] = n outf_FCT.close() - fileinput.close() - #print "All input end 1: %d , end 2: %d "%(n_list[0],n_list[1]); - all_raw_reads+=n + all_raw_reads += n #-------------------------------------------------------------------------------- # Bowtie mapping #-------------------------------------------------------------------------------- - WC2T_fr=tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) - CC2T_fr=tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + WC2T_fr = tmp_d("W_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) + CG2A_fr = tmp_d("C_C2T_fr_m"+max_mismatch_no+".mapping"+random_id) run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), 'input_file_1' : outfile_1FCT, - 'input_file_2' : outfile_2FCT, + 'input_file_2' : outfile_2RGA, 'output_file' : WC2T_fr}, aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), 'input_file_1' : outfile_1FCT, - 'input_file_2' : outfile_2FCT, - 'output_file' : CC2T_fr} ]) + 'input_file_2' : outfile_2RGA, + 'output_file' : CG2A_fr} ]) - delete_files(outfile_1FCT, outfile_2FCT) + delete_files(outfile_1FCT, outfile_2RGA) #-------------------------------------------------------------------------------- # Post processing #-------------------------------------------------------------------------------- FW_C2T_fr_U, FW_C2T_fr_R = extract_mapping(WC2T_fr) - RC_C2T_fr_U, RC_C2T_fr_R = extract_mapping(CC2T_fr) + RC_G2A_fr_U, RC_G2A_fr_R = extract_mapping(CG2A_fr) #---------------------------------------------------------------- # get uniq-hit reads #---------------------------------------------------------------- - Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_C2T_fr_U.iterkeys()) + Union_set = set(FW_C2T_fr_U.iterkeys()) | set(RC_G2A_fr_U.iterkeys()) Unique_FW_fr_C2T = set() # + - Unique_RC_fr_C2T = set() # - + Unique_RC_fr_G2A = set() # - Multiple_hits=set() for x in Union_set: _list = [] - for d in [FW_C2T_fr_U, RC_C2T_fr_U]: + for d in [FW_C2T_fr_U, RC_G2A_fr_U]: mis_lst = d.get(x,[99]) mis = int(mis_lst[0]) _list.append(mis) - for d in [FW_C2T_fr_R, RC_C2T_fr_R]: + for d in [FW_C2T_fr_R, RC_G2A_fr_R]: mis = d.get(x,99) _list.append(mis) mini = min(_list) @@ -814,7 +786,7 @@ if mini_index == 0: Unique_FW_fr_C2T.add(x) elif mini_index == 1: - Unique_RC_fr_C2T.add(x) + Unique_RC_fr_G2A.add(x) else : Multiple_hits.add(x) else : @@ -822,35 +794,34 @@ # write reads rejected by Multiple Hits to file if show_multiple_hit : - outf_MH=open("Multiple_hit.fa",'w') + outf_MH = open("Multiple_hit.fa",'w') for i in Multiple_hits : outf_MH.write(">%s\n" % i) outf_MH.write("%s\n" % original_bs_reads[i]) outf_MH.close() - FW_C2T_fr_uniq_lst=[[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] - RC_C2T_fr_uniq_lst=[[RC_C2T_fr_U[u][1],u] for u in Unique_RC_fr_C2T] + FW_C2T_fr_uniq_lst = [[FW_C2T_fr_U[u][1],u] for u in Unique_FW_fr_C2T] + RC_C2T_fr_uniq_lst = [[RC_G2A_fr_U[u][1],u] for u in Unique_RC_fr_G2A] FW_C2T_fr_uniq_lst.sort() RC_C2T_fr_uniq_lst.sort() - FW_C2T_fr_uniq_lst=[x[1] for x in FW_C2T_fr_uniq_lst] - RC_C2T_fr_uniq_lst=[x[1] for x in RC_C2T_fr_uniq_lst] + FW_C2T_fr_uniq_lst = [x[1] for x in FW_C2T_fr_uniq_lst] + RC_C2T_fr_uniq_lst = [x[1] for x in RC_C2T_fr_uniq_lst] #---------------------------------------------------------------- - numbers_premapped_lst[0]+=len(Unique_FW_fr_C2T) - numbers_premapped_lst[1]+=len(Unique_RC_fr_C2T) + numbers_premapped_lst[0] += len(Unique_FW_fr_C2T) + numbers_premapped_lst[1] += len(Unique_RC_fr_G2A) #---------------------------------------------------------------- nn = 0 - for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_C2T_fr_U)]: + for ali_unique_lst, ali_dic in [(FW_C2T_fr_uniq_lst,FW_C2T_fr_U), (RC_C2T_fr_uniq_lst,RC_G2A_fr_U)]: nn += 1 mapped_chr0 = "" for header in ali_unique_lst: _, mapped_chr, mapped_location_1, cigar1, mapped_location_2, cigar2 = ali_dic[header] - #------------------------------------- if mapped_chr != mapped_chr0: my_gseq = deserialize(db_d(mapped_chr)) @@ -860,6 +831,7 @@ original_BS_1 = original_bs_reads_1[header] original_BS_2 = reverse_compl_seq(original_bs_reads_2[header]) + #original_BS_2 = original_bs_reads_2[header] r_start_1, r_end_1, g_len_1 = get_read_start_end_and_genome_length(cigar1) r_start_2, r_end_2, g_len_2 = get_read_start_end_and_genome_length(cigar2) @@ -868,43 +840,30 @@ if nn == 1: # FW-RC mapped to + strand: FR = "+FR" -# mapped_location_1 += 1 -# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] -# origin_genome_1 = origin_genome_long_1[2:-2] mapped_strand_1 = "+" - -# mapped_location_2 += 1 -# origin_genome_long_2 = my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1] -# origin_genome_2 = origin_genome_long_2[2:-2] mapped_strand_2 = "+" - elif nn == 2: # FW-RC mapped to - strand: - FR="-FR" mapped_location_1 = chr_length - mapped_location_1 - g_len_1 -# origin_genome_long_1 = my_gseq[mapped_location_1 - 2 - 1 : mapped_location_1 + g_len_1 + 2 - 1] -# origin_genome_long_1 = reverse_compl_seq(origin_genome_long_1) -# origin_genome_1 = origin_genome_long_1[2:-2] mapped_strand_1 = "-" - mapped_location_2 = chr_length - mapped_location_2 - g_len_2 -# origin_genome_long_2 = reverse_compl_seq(my_gseq[mapped_location_2 - 2 - 1 : mapped_location_2 + g_len_2 + 2 - 1]) -# origin_genome_2 = origin_genome_long_2[2:-2] mapped_strand_2 = "-" - - origin_genome_1, next_1, output_genome_1 = get_genomic_sequence(my_gseq, mapped_location_1, mapped_location_1 + g_len_1, mapped_strand_1) origin_genome_2, next_2, output_genome_2 = get_genomic_sequence(my_gseq, mapped_location_2, mapped_location_2 + g_len_2, mapped_strand_2) - r_aln_1, g_aln_1 = cigar_to_alignment(cigar1, original_BS_1, origin_genome_1) r_aln_2, g_aln_2 = cigar_to_alignment(cigar2, original_BS_2, origin_genome_2) N_mismatch_1 = N_MIS(r_aln_1, g_aln_1) #+ original_BS_length_1 - (r_end_1 - r_start_1) # mismatches in the alignment + soft clipped nucleotides N_mismatch_2 = N_MIS(r_aln_2, g_aln_2) #+ original_BS_length_2 - (r_end_2 - r_start_2) # mismatches in the alignment + soft clipped nucleotides - if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): +# if max(N_mismatch_1, N_mismatch_2) <= int(max_mismatch_no): +# if N_mismatch <= int(max_mismatch_no) : + N_mismatch = max(N_mismatch_1, N_mismatch_2) + mm_no=float(max_mismatch_no) + if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch_1<=mm_no*len(original_BS_1) + and N_mismatch_2<=mm_no*len(original_BS_2)): numbers_mapped_lst[nn-1] += 1 all_mapped_passed += 1 @@ -914,8 +873,6 @@ del original_bs_reads_2[header] #--------------------------------------- -# output_genome_1 = origin_genome_long_1[0:2] + "_" + origin_genome_1 + "_" + origin_genome_long_1[-2:] -# output_genome_2 = origin_genome_long_2[0:2] + "_" + origin_genome_2 + "_" + origin_genome_long_2[-2:] methy_1=methy_seq(r_aln_1, g_aln_1 + next_1) methy_2=methy_seq(r_aln_2, g_aln_2 + next_2) mC_lst,uC_lst = mcounts(methy_1, mC_lst, uC_lst) @@ -936,11 +893,20 @@ if( (nmCH_2>XS_count) and nmCH_2/float(nCH_2+nmCH_2)>XS_pct ) : XS_2 = 1 + if mapped_strand_1 == '+' : + flag_1 = 67 # 1000011 : 1st read, + strand + flag_2 = 131 # 10000011 : 2nd read, + strand + else : + flag_1 = 115 # 1110011 : 1st read, - strand + flag_2 = 179 # 10110011 : 2nd read, - strand - outfile.store(header, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, + outfile.store2( read_id_lst_1[header], flag_1, N_mismatch_1, FR, mapped_chr, mapped_strand_1, mapped_location_1, cigar1, original_BS_1, methy_1, XS_1, output_genome = output_genome_1, rnext = mapped_chr, pnext = mapped_location_2) - outfile.store(header, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, + all_base_mapped += len(original_BS_1) + + outfile.store2( read_id_lst_2[header], flag_2, N_mismatch_2, FR, mapped_chr, mapped_strand_2, mapped_location_2, cigar2, original_BS_2, methy_2, XS_2, output_genome = output_genome_2, rnext = mapped_chr, pnext = mapped_location_1) + all_base_mapped += len(original_BS_2) print "--> %s %s (%d/%d) " % (read_file_1, read_file_2, no_my_files, len(my_files)) #---------------------------------------------------------------- @@ -956,45 +922,40 @@ all_unmapped+=len(unmapped_lst) - #================================================================================================== -# outf.close() -# -# outf_u1.close() -# outf_u2.close() delete_files(tmp_path) logm("-------------------------------- " ) - logm("O Number of raw BS-read pairs: %d ( %d bp)"%(all_raw_reads,cut2-cut1+1) ) - logm("O Number of ends trimmed for adapter: %d"% all_trimmed+"\n") + logm("Number of raw BS-read pairs: %d " %(all_raw_reads/2) ) + if adapterA != "" or adapterB != "" : + logm("Number of reads having adapter removed: %d \n"% all_trimmed) + logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) ) if all_raw_reads >0: - logm("O Number of unique-hits read pairs for post-filtering: %d" % all_mapped + "\n") + logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) + logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped + "\n") if asktag=="Y": - logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) - logm("O -- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) - logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) - logm("O -- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) + logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) + logm("-- %7d RC-FW pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) + logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) + logm("-- %7d RC-FW pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) elif asktag=="N": - logm("O -- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) - logm("O -- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) - + logm("-- %7d FW-RC pairs mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) + logm("-- %7d FW-RC pairs mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) + logm("--- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) + if asktag=="Y": + logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) + logm("----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) + logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) + logm("----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) + elif asktag=="N": + logm("----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) + logm("----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) - logm("O --- Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) - logm("O --- %d uniquely aligned pairs, where each end has mismatches <= %s"%(all_mapped_passed, max_mismatch_no) ) - if asktag=="Y": - logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) - logm("O ----- %7d RC-FW pairs mapped to Watson strand"%(numbers_mapped_lst[1]) ) - logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[2]) ) - logm("O ----- %7d RC-FW pairs mapped to Crick strand"%(numbers_mapped_lst[3]) ) - elif asktag=="N": - logm("O ----- %7d FW-RC pairs mapped to Watson strand"%(numbers_mapped_lst[0]) ) - logm("O ----- %7d FW-RC pairs mapped to Crick strand"%(numbers_mapped_lst[1]) ) - - logm("O Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) - logm("O Unmapped read pairs: %d"% all_unmapped+"\n") - + logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)*2/all_raw_reads) ) + logm("Total bases of uniquely mapped reads %7d"% all_base_mapped ) + logm("Unmapped read pairs: %d"% all_unmapped+"\n") n_CG=mC_lst[0]+uC_lst[0] n_CHG=mC_lst[1]+uC_lst[1]