Mercurial > repos > weilong-guo > bs_seeker2
diff BSseeker2/bs_align/bs_single_end.py @ 0:e6df770c0e58 draft
Initial upload
| author | weilong-guo |
|---|---|
| date | Fri, 12 Jul 2013 18:47:28 -0400 |
parents | |
children | 8b26adf64adc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BSseeker2/bs_align/bs_single_end.py Fri Jul 12 18:47:28 2013 -0400 @@ -0,0 +1,732 @@ +import fileinput, os, time, random, math +from bs_utils.utils import * +from bs_align_utils import * + +#---------------------------------------------------------------- +# Read from the mapped results, return lists of unique / multiple-hit reads +# The function suppose at most 2 hits will be reported in single file +def extract_mapping(ali_file): + unique_hits = {} + non_unique_hits = {} + + header0 = "" + lst = [] + + for header, chr, location, no_mismatch, cigar in process_aligner_output(ali_file): + #------------------------------ + if header != header0: + #---------- output ----------- + if len(lst) == 1: + unique_hits[header0] = lst[0] # [no_mismatch, chr, location] + elif len(lst) > 1: + min_lst = min(lst, key = lambda x: x[0]) + max_lst = max(lst, key = lambda x: x[0]) + + if min_lst[0] < max_lst[0]: + unique_hits[header0] = min_lst + else: + non_unique_hits[header0] = min_lst[0] + #print "multiple hit", header, chr, location, no_mismatch, cigar # test + header0 = header + lst = [(no_mismatch, chr, location, cigar)] + else: # header == header0, same header (read id) + lst.append((no_mismatch, chr, location, cigar)) + + if len(lst) == 1: + unique_hits[header0] = lst[0] # [no_mismatch, chr, location] + elif len(lst) > 1: + min_lst = min(lst, key = lambda x: x[0]) + max_lst = max(lst, key = lambda x: x[0]) + + if min_lst[0] < max_lst[0]: + unique_hits[header0] = min_lst + else: + non_unique_hits[header0] = min_lst[0] + + + return unique_hits, non_unique_hits + + +def bs_single_end(main_read_file, asktag, adapter_file, cut1, cut2, no_small_lines, + max_mismatch_no, aligner_command, db_path, tmp_path, outfile, + XS_pct, XS_count, adapter_mismatch, show_multiple_hit=False): + #---------------------------------------------------------------- + # adapter : strand-specific or not + adapter="" + adapter_fw="" + adapter_rc="" + if 
adapter_file !="": + try : + adapter_inf=open(adapter_file,"r") + except IOError: + print "[Error] Cannot open adapter file : %s" % adapter_file + exit(-1) + if asktag == "N": #<--- directional library + adapter=adapter_inf.readline() + adapter_inf.close() + adapter=adapter.rstrip("\n") + elif asktag == "Y":#<--- un-directional library + adapter_fw=adapter_inf.readline() + adapter_rc=adapter_inf.readline() + adapter_inf.close() + adapter_fw=adapter_fw.rstrip("\n") + adapter_rc=adapter_rc.rstrip("\n") + adapter_inf.close() + #---------------------------------------------------------------- + + + + #---------------------------------------------------------------- + logm("Read filename: %s"% main_read_file ) + logm("Un-directional library: %s" % asktag ) + logm("The first base (for mapping): %d" % cut1) + logm("The last base (for mapping): %d" % cut2) + logm("Max. lines per mapping: %d"% no_small_lines) + logm("Aligner: %s" % aligner_command) + logm("Reference genome library path: %s" % db_path ) + logm("Number of mismatches allowed: %s" % max_mismatch_no ) + if adapter_file !="": + if asktag=="N": + logm("Adapter to be removed from 3' reads: %s"%(adapter.rstrip("\n"))) + elif asktag=="Y": + logm("Adapter to be removed from 3' FW reads: %s"%(adapter_fw.rstrip("\n")) ) + logm("Adapter to be removed from 3' RC reads: %s"%(adapter_rc.rstrip("\n")) ) + #---------------------------------------------------------------- + + # helper method to join fname with tmp_path + tmp_d = lambda fname: os.path.join(tmp_path, fname) + + db_d = lambda fname: os.path.join(db_path, fname) + + #---------------------------------------------------------------- + # splitting the big read file + + input_fname = os.path.split(main_read_file)[1] + +# split_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines) +# my_files = sorted(splitted_file for splitted_file in os.listdir(tmp_path) +# if splitted_file.startswith("%s-s-" % input_fname)) + + #---- Stats 
------------------------------------------------------------ + all_raw_reads=0 + all_trimed=0 + all_mapped=0 + all_mapped_passed=0 + + numbers_premapped_lst=[0,0,0,0] + numbers_mapped_lst=[0,0,0,0] + + mC_lst=[0,0,0] + uC_lst=[0,0,0] + + + no_my_files=0 + + #---------------------------------------------------------------- + logm("== Start mapping ==") + + for read_file in isplit_file(main_read_file, tmp_d(input_fname)+'-s-', no_small_lines): +# for read_file in my_files: + original_bs_reads = {} + no_my_files+=1 + random_id = ".tmp-"+str(random.randint(1000000,9999999)) + + #------------------------------------------------------------------- + # undirectional sequencing + #------------------------------------------------------------------- + if asktag=="Y": + + #---------------------------------------------------------------- + outfile2=tmp_d('Trimmed_C2T.fa'+random_id) + outfile3=tmp_d('Trimmed_G2A.fa'+random_id) + + outf2=open(outfile2,'w') + outf3=open(outfile3,'w') + + #---------------------------------------------------------------- + # detect format of input file + try : + read_inf=open(read_file,"r") + except IOError : + print "[Error] Cannot open input file : %s" % read_file + exit(-1) + + oneline=read_inf.readline() + l=oneline.split() + input_format="" + if oneline[0]=="@": # fastq + input_format="fastq" + n_fastq=0 + elif len(l)==1 and oneline[0]!=">": # pure sequences + input_format="seq" + elif len(l)==11: # qseq + input_format="qseq" + elif oneline[0]==">": # fasta + input_format="fasta" + n_fasta=0 + read_inf.close() + + #---------------------------------------------------------------- + # read sequence, remove adapter and convert + read_id="" + seq="" + seq_ready="N" + for line in fileinput.input(read_file): + l=line.split() + + if input_format=="seq": + all_raw_reads+=1 + read_id=str(all_raw_reads) + read_id=read_id.zfill(12) + seq=l[0] + seq_ready="Y" + elif input_format=="fastq": + m_fastq=math.fmod(n_fastq,4) + n_fastq+=1 + seq_ready="N" + if 
m_fastq==0: + all_raw_reads+=1 + read_id=str(all_raw_reads) + read_id=read_id.zfill(12) + seq="" + elif m_fastq==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + elif input_format=="qseq": + all_raw_reads+=1 + read_id=str(all_raw_reads) + read_id=read_id.zfill(12) + seq=l[8] + seq_ready="Y" + elif input_format=="fasta": + m_fasta=math.fmod(n_fasta,2) + n_fasta+=1 + seq_ready="N" + if m_fasta==0: + all_raw_reads+=1 + #read_id=str(all_raw_reads) + read_id=l[0][1:] + seq="" + elif m_fasta==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + + #---------------------------------------------------------------- + if seq_ready=="Y": + seq=seq[cut1-1:cut2] #<---- selecting 0..52 from 1..72 -e 52 + seq=seq.upper() + seq=seq.replace(".","N") + + # striping BS adapter from 3' read + if (adapter_fw !="") and (adapter_rc !="") : + new_read = RemoveAdapter(seq, adapter_fw, adapter_mismatch) + new_read = Remove_5end_Adapter(new_read, adapter_rc) + if len(new_read) < len(seq) : + all_trimed += 1 + seq = new_read + + if len(seq)<=4: + seq=''.join(["N" for x in xrange(cut2-cut1+1)]) + + #--------- trimmed_raw_BS_read ------------------ + original_bs_reads[read_id] = seq + + #--------- FW_C2T ------------------ + outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) + #--------- RC_G2A ------------------ + outf3.write('>%s\n%s\n' % (read_id, seq.replace("G","A"))) + + fileinput.close() + + outf2.close() + outf3.close() + + delete_files(read_file) + + #-------------------------------------------------------------------------------- + # Bowtie mapping + #------------------------------------------------------------------------------- + WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) + CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) + WG2A=tmp_d("W_G2A_m"+max_mismatch_no+".mapping"+random_id) + CG2A=tmp_d("C_G2A_m"+max_mismatch_no+".mapping"+random_id) + + # print aligner_command % {'int_no_mismatches' : int_no_mismatches, + # 'reference_genome' : 
os.path.join(db_path,'W_C2T'), + # 'input_file' : outfile2, + # 'output_file' : WC2T} + + run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file' : outfile2, + 'output_file' : WC2T}, + + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file' : outfile2, + 'output_file' : CC2T}, + + aligner_command % {'reference_genome' : os.path.join(db_path,'W_G2A'), + 'input_file' : outfile3, + 'output_file' : WG2A}, + + aligner_command % {'reference_genome' : os.path.join(db_path,'C_G2A'), + 'input_file' : outfile3, + 'output_file' : CG2A} ]) + + + delete_files(outfile2, outfile3) + + + #-------------------------------------------------------------------------------- + # Post processing + #-------------------------------------------------------------------------------- + + FW_C2T_U,FW_C2T_R=extract_mapping(WC2T) + RC_G2A_U,RC_G2A_R=extract_mapping(CG2A) + + FW_G2A_U,FW_G2A_R=extract_mapping(WG2A) + RC_C2T_U,RC_C2T_R=extract_mapping(CC2T) + + #---------------------------------------------------------------- + # get unique-hit reads + #---------------------------------------------------------------- + Union_set=set(FW_C2T_U.iterkeys()) | set(RC_G2A_U.iterkeys()) | set(FW_G2A_U.iterkeys()) | set(RC_C2T_U.iterkeys()) + + Unique_FW_C2T=set() # + + Unique_RC_G2A=set() # + + Unique_FW_G2A=set() # - + Unique_RC_C2T=set() # - + Multiple_hits=set() + + + for x in Union_set: + _list=[] + for d in [FW_C2T_U, RC_G2A_U, FW_G2A_U, RC_C2T_U]: + mis_lst=d.get(x,[99]) + mis=int(mis_lst[0]) + _list.append(mis) + for d in [FW_C2T_R, RC_G2A_R, FW_G2A_R, RC_C2T_R]: + mis=d.get(x,99) + _list.append(mis) + mini=min(_list) + if _list.count(mini) == 1: + mini_index=_list.index(mini) + if mini_index == 0: + Unique_FW_C2T.add(x) + elif mini_index == 1: + Unique_RC_G2A.add(x) + elif mini_index == 2: + Unique_FW_G2A.add(x) + elif mini_index == 3: + Unique_RC_C2T.add(x) + # if mini_index = 4,5,6,7, indicating multiple hits + else : 
+ Multiple_hits.add(x) + else : + Multiple_hits.add(x) + # write reads rejected by Multiple Hits to file + if show_multiple_hit : + outf_MH=open("Multiple_hit.fa",'w') + for i in Multiple_hits : + outf_MH.write(">%s\n" % i) + outf_MH.write("%s\n" % original_bs_reads[i]) + outf_MH.close() + + del Union_set + del FW_C2T_R + del FW_G2A_R + del RC_C2T_R + del RC_G2A_R + + FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] + FW_G2A_uniq_lst=[[FW_G2A_U[u][1],u] for u in Unique_FW_G2A] + RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] + RC_G2A_uniq_lst=[[RC_G2A_U[u][1],u] for u in Unique_RC_G2A] + FW_C2T_uniq_lst.sort() + RC_C2T_uniq_lst.sort() + FW_G2A_uniq_lst.sort() + RC_G2A_uniq_lst.sort() + FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] + RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] + FW_G2A_uniq_lst=[x[1] for x in FW_G2A_uniq_lst] + RC_G2A_uniq_lst=[x[1] for x in RC_G2A_uniq_lst] + + del Unique_FW_C2T + del Unique_FW_G2A + del Unique_RC_C2T + del Unique_RC_G2A + + #---------------------------------------------------------------- + numbers_premapped_lst[0] += len(Unique_FW_C2T) + numbers_premapped_lst[1] += len(Unique_RC_G2A) + numbers_premapped_lst[2] += len(Unique_FW_G2A) + numbers_premapped_lst[3] += len(Unique_RC_C2T) + + + #---------------------------------------------------------------- + + nn=0 + gseq = dict() + chr_length = dict() + for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U), + (RC_G2A_uniq_lst,RC_G2A_U), + (FW_G2A_uniq_lst,FW_G2A_U), + (RC_C2T_uniq_lst,RC_C2T_U)]: + nn += 1 + mapped_chr0 = "" + + for header in ali_unique_lst: + + _, mapped_chr, mapped_location, cigar = ali_dic[header] + + original_BS = original_bs_reads[header] + #------------------------------------- + if mapped_chr not in gseq: + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + + if nn == 2 or nn == 3: + cigar = list(reversed(cigar)) + r_start, r_end, g_len = 
get_read_start_end_and_genome_length(cigar) + + + all_mapped += 1 + + if nn == 1: # +FW mapped to + strand: + FR = "+FW" + mapped_strand="+" + + elif nn == 2: # +RC mapped to + strand: + FR = "+RC" # RC reads from -RC reflecting the methylation status on Watson strand (+) + mapped_location = chr_length[mapped_chr] - mapped_location - g_len + mapped_strand = "+" + original_BS = reverse_compl_seq(original_BS) # for RC reads + + elif nn == 3: # -RC mapped to - strand: + mapped_strand = "-" + FR = "-RC" # RC reads from +RC reflecting the methylation status on Crick strand (-) + original_BS = reverse_compl_seq(original_BS) # for RC reads + + elif nn == 4: # -FW mapped to - strand: + mapped_strand = "-" + FR = "-FW" + mapped_location = chr_length[mapped_chr] - mapped_location - g_len + + origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) + + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + + if len(r_aln)==len(g_aln): + N_mismatch = N_MIS(r_aln, g_aln) + if N_mismatch <= int(max_mismatch_no): + numbers_mapped_lst[nn-1] += 1 + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln + next) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) + + #---------------------------------------------------------------- + logm("--> %s (%d) "%(read_file, no_my_files)) + delete_files(WC2T, WG2A, CC2T, CG2A) + + + + #-------------------------------------------------------------------- + # directional sequencing + #-------------------------------------------------------------------- + + if asktag=="N": + 
#---------------------------------------------------------------- + outfile2=tmp_d('Trimed_C2T.fa'+random_id) + outf2=open(outfile2,'w') + + n=0 + #---------------------------------------------------------------- + try : + read_inf=open(read_file,"r") + except IOError : + print "[Error] Cannot open input file : %s" % read_file + exit(-1) + + oneline=read_inf.readline() + l=oneline.split() + input_format="" + if oneline[0]=="@": # FastQ + input_format="fastq" + n_fastq=0 + elif len(l)==1 and oneline[0]!=">": # pure sequences + input_format="seq" + elif len(l)==11: # Illumina GAII qseq file + input_format="qseq" + elif oneline[0]==">": # fasta + input_format="fasta" + n_fasta=0 + read_inf.close() + #print "detected data format: %s"%(input_format) + #---------------------------------------------------------------- + read_id="" + seq="" + seq_ready="N" + for line in fileinput.input(read_file): + l=line.split() + if input_format=="seq": + all_raw_reads+=1 + read_id=str(all_raw_reads) + read_id=read_id.zfill(12) + seq=l[0] + seq_ready="Y" + elif input_format=="fastq": + m_fastq=math.fmod(n_fastq,4) + n_fastq+=1 + seq_ready="N" + if m_fastq==0: + all_raw_reads+=1 + read_id=str(all_raw_reads) + read_id=read_id.zfill(12) + seq="" + elif m_fastq==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + elif input_format=="qseq": + all_raw_reads+=1 + read_id=str(all_raw_reads) + read_id=read_id.zfill(12) + seq=l[8] + seq_ready="Y" + elif input_format=="fasta": + m_fasta=math.fmod(n_fasta,2) + n_fasta+=1 + seq_ready="N" + if m_fasta==0: + all_raw_reads+=1 + read_id=l[0][1:] + seq="" + elif m_fasta==1: + seq=l[0] + seq_ready="Y" + else: + seq="" + + #-------------------------------- + if seq_ready=="Y": + seq=seq[cut1-1:cut2] #<---selecting 0..52 from 1..72 -e 52 + seq=seq.upper() + seq=seq.replace(".","N") + + #--striping adapter from 3' read ------- + if adapter != "": + new_read = RemoveAdapter(seq, adapter, adapter_mismatch) + if len(new_read) < len(seq) : + all_trimed += 1 + seq = 
new_read + + if len(seq)<=4: + seq = "N" * (cut2-cut1+1) + + #--------- trimmed_raw_BS_read ------------------ + original_bs_reads[read_id] = seq + + + #--------- FW_C2T ------------------ + outf2.write('>%s\n%s\n' % (read_id, seq.replace("C","T"))) + + fileinput.close() + + outf2.close() + delete_files(read_file) + + #-------------------------------------------------------------------------------- + # Bowtie mapping + #-------------------------------------------------------------------------------- + WC2T=tmp_d("W_C2T_m"+max_mismatch_no+".mapping"+random_id) + CC2T=tmp_d("C_C2T_m"+max_mismatch_no+".mapping"+random_id) + + run_in_parallel([ aligner_command % {'reference_genome' : os.path.join(db_path,'W_C2T'), + 'input_file' : outfile2, + 'output_file' : WC2T}, + aligner_command % {'reference_genome' : os.path.join(db_path,'C_C2T'), + 'input_file' : outfile2, + 'output_file' : CC2T} ]) + + delete_files(outfile2) + + #-------------------------------------------------------------------------------- + # Post processing + #-------------------------------------------------------------------------------- + + + FW_C2T_U, FW_C2T_R = extract_mapping(WC2T) + RC_C2T_U, RC_C2T_R = extract_mapping(CC2T) + + #---------------------------------------------------------------- + # get uniq-hit reads + #---------------------------------------------------------------- + Union_set = set(FW_C2T_U.iterkeys()) | set(RC_C2T_U.iterkeys()) + + Unique_FW_C2T = set() # + + Unique_RC_C2T = set() # - + Multiple_hits=set() + # write reads rejected by Multiple Hits to file + + for x in Union_set: + _list=[] + for d in [FW_C2T_U,RC_C2T_U]: + mis_lst=d.get(x,[99]) + mis=int(mis_lst[0]) + _list.append(mis) + for d in [FW_C2T_R,RC_C2T_R]: + mis=d.get(x,99) + _list.append(mis) + mini=min(_list) + #print _list + if _list.count(mini)==1: + mini_index=_list.index(mini) + if mini_index==0: + Unique_FW_C2T.add(x) + elif mini_index==1: + Unique_RC_C2T.add(x) + else: + Multiple_hits.add(x) + else : + 
Multiple_hits.add(x) + # write reads rejected by Multiple Hits to file + if show_multiple_hit : + outf_MH=open("Multiple_hit.fa",'w') + for i in Multiple_hits : + outf_MH.write(">%s\n" % i) + outf_MH.write("%s\n" % original_bs_reads[i]) + outf_MH.close() + + + + FW_C2T_uniq_lst=[[FW_C2T_U[u][1],u] for u in Unique_FW_C2T] + RC_C2T_uniq_lst=[[RC_C2T_U[u][1],u] for u in Unique_RC_C2T] + FW_C2T_uniq_lst.sort() + RC_C2T_uniq_lst.sort() + FW_C2T_uniq_lst=[x[1] for x in FW_C2T_uniq_lst] + RC_C2T_uniq_lst=[x[1] for x in RC_C2T_uniq_lst] + + + #---------------------------------------------------------------- + + numbers_premapped_lst[0] += len(Unique_FW_C2T) + numbers_premapped_lst[1] += len(Unique_RC_C2T) + + #---------------------------------------------------------------- + + nn = 0 + gseq = dict() + chr_length = dict() + for ali_unique_lst, ali_dic in [(FW_C2T_uniq_lst,FW_C2T_U),(RC_C2T_uniq_lst,RC_C2T_U)]: + nn += 1 + mapped_chr0 = "" + for header in ali_unique_lst: + _, mapped_chr, mapped_location, cigar = ali_dic[header] + original_BS = original_bs_reads[header] + #------------------------------------- + if mapped_chr not in gseq : + gseq[mapped_chr] = deserialize(db_d(mapped_chr)) + chr_length[mapped_chr] = len(gseq[mapped_chr]) + #if mapped_chr != mapped_chr0: + # my_gseq = deserialize(db_d(mapped_chr)) + # chr_length = len(my_gseq) + # mapped_chr0 = mapped_chr + #------------------------------------- + + r_start, r_end, g_len = get_read_start_end_and_genome_length(cigar) + + all_mapped+=1 + if nn == 1: # +FW mapped to + strand: + FR = "+FW" + mapped_strand = "+" + elif nn == 2: # -FW mapped to - strand: + mapped_strand = "-" + FR = "-FW" + mapped_location = chr_length[mapped_chr] - mapped_location - g_len + + + origin_genome, next, output_genome = get_genomic_sequence(gseq[mapped_chr], mapped_location, mapped_location + g_len, mapped_strand) + r_aln, g_aln = cigar_to_alignment(cigar, original_BS, origin_genome) + + if len(r_aln) == len(g_aln): + N_mismatch = 
N_MIS(r_aln, g_aln) #+ original_BS_length - (r_end - r_start) # mismatches in the alignment + soft clipped nucleotides + if N_mismatch <= int(max_mismatch_no): + numbers_mapped_lst[nn-1] += 1 + all_mapped_passed += 1 + methy = methy_seq(r_aln, g_aln+next) + mC_lst, uC_lst = mcounts(methy, mC_lst, uC_lst) + + #---XS FILTER---------------- + XS = 0 + nCH = methy.count('y') + methy.count('z') + nmCH = methy.count('Y') + methy.count('Z') + if( (nmCH>XS_count) and nmCH/float(nCH+nmCH)>XS_pct ) : + XS = 1 + + outfile.store(header, N_mismatch, FR, mapped_chr, mapped_strand, mapped_location, cigar, original_BS, methy, XS, output_genome = output_genome) + + #---------------------------------------------------------------- + logm("--> %s (%d) "%(read_file,no_my_files)) + delete_files(WC2T, CC2T) + + + #---------------------------------------------------------------- + +# outf.close() + delete_files(tmp_path) + + logm("Number of raw reads: %d \n"% all_raw_reads) + if all_raw_reads >0: + logm("Number of reads having adapter removed: %d \n" % all_trimed ) + logm("Number of reads rejected because of multiple hits: %d\n" % len(Multiple_hits) ) + logm("Number of unique-hits reads for post-filtering: %d\n" % all_mapped) + if asktag=="Y": + logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) + logm(" ---- %7d RC reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[1]) ) + logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[2]) ) + logm(" ---- %7d RC reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[3]) ) + elif asktag=="N": + logm(" ---- %7d FW reads mapped to Watson strand (before post-filtering)"%(numbers_premapped_lst[0]) ) + logm(" ---- %7d FW reads mapped to Crick strand (before post-filtering)"%(numbers_premapped_lst[1]) ) + + logm("Post-filtering %d uniquely aligned reads with mismatches <= %s"%(all_mapped_passed, max_mismatch_no) 
) + if asktag=="Y": + logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) + logm(" ---- %7d RC reads mapped to Watson strand"%(numbers_mapped_lst[1]) ) + logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[2]) ) + logm(" ---- %7d RC reads mapped to Crick strand"%(numbers_mapped_lst[3]) ) + elif asktag=="N": + logm(" ---- %7d FW reads mapped to Watson strand"%(numbers_mapped_lst[0]) ) + logm(" ---- %7d FW reads mapped to Crick strand"%(numbers_mapped_lst[1]) ) + logm("Mappability= %1.4f%%"%(100*float(all_mapped_passed)/all_raw_reads) ) + + n_CG=mC_lst[0]+uC_lst[0] + n_CHG=mC_lst[1]+uC_lst[1] + n_CHH=mC_lst[2]+uC_lst[2] + + logm("----------------------------------------------" ) + logm("Methylated C in mapped reads ") + + logm(" mCG %1.3f%%"%((100*float(mC_lst[0])/n_CG) if n_CG != 0 else 0)) + logm(" mCHG %1.3f%%"%((100*float(mC_lst[1])/n_CHG) if n_CHG != 0 else 0)) + logm(" mCHH %1.3f%%"%((100*float(mC_lst[2])/n_CHH) if n_CHH != 0 else 0)) + + logm("------------------- END --------------------" ) + elapsed("=== END %s ===" % main_read_file) + + +