Mercurial > repos > xuebing > sharplabtool
comparison tools/solid_tools/maq_cs_wrapper.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #Guruprasad Ananda | |
| 3 #MAQ mapper for SOLiD colourspace-reads | |
| 4 | |
| 5 import sys, os, zipfile, tempfile, subprocess | |
| 6 | |
| 7 def stop_err( msg ): | |
| 8 sys.stderr.write( "%s\n" % msg ) | |
| 9 sys.exit() | |
| 10 | |
| 11 def __main__(): | |
| 12 | |
| 13 out_fname = sys.argv[1].strip() | |
| 14 out_f2 = open(sys.argv[2].strip(),'r+') | |
| 15 ref_fname = sys.argv[3].strip() | |
| 16 f3_read_fname = sys.argv[4].strip() | |
| 17 f3_qual_fname = sys.argv[5].strip() | |
| 18 paired = sys.argv[6] | |
| 19 if paired == 'yes': | |
| 20 r3_read_fname = sys.argv[7].strip() | |
| 21 r3_qual_fname = sys.argv[8].strip() | |
| 22 min_mapqual = int(sys.argv[9].strip()) | |
| 23 max_mismatch = int(sys.argv[10].strip()) | |
| 24 out_f3name = sys.argv[11].strip() | |
| 25 subprocess_dict = {} | |
| 26 | |
| 27 ref_csfa = tempfile.NamedTemporaryFile() | |
| 28 ref_bfa = tempfile.NamedTemporaryFile() | |
| 29 ref_csbfa = tempfile.NamedTemporaryFile() | |
| 30 cmd2_1 = 'maq fasta2csfa %s > %s 2>&1' %(ref_fname,ref_csfa.name) | |
| 31 cmd2_2 = 'maq fasta2bfa %s %s 2>&1' %(ref_csfa.name,ref_csbfa.name) | |
| 32 cmd2_3 = 'maq fasta2bfa %s %s 2>&1' %(ref_fname,ref_bfa.name) | |
| 33 try: | |
| 34 os.system(cmd2_1) | |
| 35 os.system(cmd2_2) | |
| 36 os.system(cmd2_3) | |
| 37 except Exception, erf: | |
| 38 stop_err(str(erf)+"Error processing reference sequence") | |
| 39 | |
| 40 if paired == 'yes': #paired end reads | |
| 41 tmpf = tempfile.NamedTemporaryFile() #forward reads | |
| 42 tmpr = tempfile.NamedTemporaryFile() #reverse reads | |
| 43 tmps = tempfile.NamedTemporaryFile() #single reads | |
| 44 tmpffastq = tempfile.NamedTemporaryFile() | |
| 45 tmprfastq = tempfile.NamedTemporaryFile() | |
| 46 tmpsfastq = tempfile.NamedTemporaryFile() | |
| 47 | |
| 48 cmd1 = "solid2fastq_modified.pl 'yes' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,tmpr.name,tmps.name,f3_read_fname,f3_qual_fname,r3_read_fname,r3_qual_fname) | |
| 49 try: | |
| 50 os.system(cmd1) | |
| 51 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpffastq.name)) | |
| 52 os.system('gunzip -c %s >> %s' %(tmpr.name,tmprfastq.name)) | |
| 53 os.system('gunzip -c %s >> %s' %(tmps.name,tmpsfastq.name)) | |
| 54 | |
| 55 except Exception, eq: | |
| 56 stop_err("Error converting data to fastq format." + str(eq)) | |
| 57 | |
| 58 #make a temp directory where the split fastq files will be stored | |
| 59 try: | |
| 60 split_dir = tempfile.mkdtemp() | |
| 61 split_file_prefix_f = tempfile.mktemp(dir=split_dir) | |
| 62 split_file_prefix_r = tempfile.mktemp(dir=split_dir) | |
| 63 splitcmd_f = 'split -a 2 -l %d %s %s' %(32000000,tmpffastq.name,split_file_prefix_f) #32M lines correspond to 8M reads | |
| 64 splitcmd_r = 'split -a 2 -l %d %s %s' %(32000000,tmprfastq.name,split_file_prefix_r) #32M lines correspond to 8M reads | |
| 65 | |
| 66 os.system(splitcmd_f) | |
| 67 os.system(splitcmd_r) | |
| 68 os.chdir(split_dir) | |
| 69 ii = 0 | |
| 70 for fastq in os.listdir(split_dir): | |
| 71 if not fastq.startswith(split_file_prefix_f.split("/")[-1]): | |
| 72 continue | |
| 73 fastq_r = split_file_prefix_r + fastq.split(split_file_prefix_f.split("/")[-1])[1] #find the reverse strand fastq corresponding to formward strand fastq | |
| 74 tmpbfq_f = tempfile.NamedTemporaryFile() | |
| 75 tmpbfq_r = tempfile.NamedTemporaryFile() | |
| 76 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq_f.name,fastq_r,tmpbfq_r.name,fastq,ref_csbfa.name,tmpbfq_f.name,tmpbfq_r.name,fastq,fastq) | |
| 77 subprocess_dict['sp'+str(ii+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE) | |
| 78 ii += 1 | |
| 79 while True: | |
| 80 all_done = True | |
| 81 for j,k in enumerate(subprocess_dict.keys()): | |
| 82 if subprocess_dict['sp'+str(j+1)].wait() != 0: | |
| 83 err = subprocess_dict['sp'+str(j+1)].communicate()[1] | |
| 84 if err != None: | |
| 85 stop_err("Mapping error: %s" %err) | |
| 86 all_done = False | |
| 87 if all_done: | |
| 88 break | |
| 89 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname) | |
| 90 os.system(cmdout) | |
| 91 | |
| 92 tmpcsmap = tempfile.NamedTemporaryFile() | |
| 93 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name) | |
| 94 os.system(cmd_cat_csmap) | |
| 95 | |
| 96 tmppileup = tempfile.NamedTemporaryFile() | |
| 97 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name) | |
| 98 os.system(cmdpileup) | |
| 99 tmppileup.seek(0) | |
| 100 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count" | |
| 101 for line in file(tmppileup.name): | |
| 102 elems = line.strip().split() | |
| 103 ref_nt = elems[2].capitalize() | |
| 104 read_nt = elems[4] | |
| 105 coverage = int(elems[3]) | |
| 106 a,t,g,c = 0,0,0,0 | |
| 107 ref_nt_count = 0 | |
| 108 for ch in read_nt: | |
| 109 ch = ch.capitalize() | |
| 110 if ch not in ['A','T','G','C',',','.']: | |
| 111 continue | |
| 112 if ch in [',','.']: | |
| 113 ch = ref_nt | |
| 114 ref_nt_count += 1 | |
| 115 try: | |
| 116 nt_ind = ['A','T','G','C'].index(ch) | |
| 117 if nt_ind == 0: | |
| 118 a+=1 | |
| 119 elif nt_ind == 1: | |
| 120 t+=1 | |
| 121 elif nt_ind == 2: | |
| 122 g+=1 | |
| 123 else: | |
| 124 c+=1 | |
| 125 except ValueError, we: | |
| 126 print >>sys.stderr, we | |
| 127 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c) | |
| 128 except Exception, er2: | |
| 129 stop_err("Encountered error while mapping: %s" %(str(er2))) | |
| 130 | |
| 131 | |
| 132 else: #single end reads | |
| 133 tmpf = tempfile.NamedTemporaryFile() | |
| 134 tmpfastq = tempfile.NamedTemporaryFile() | |
| 135 cmd1 = "solid2fastq_modified.pl 'no' %s %s %s %s %s %s %s 2>&1" %(tmpf.name,None,None,f3_read_fname,f3_qual_fname,None,None) | |
| 136 try: | |
| 137 os.system(cmd1) | |
| 138 os.system('gunzip -c %s >> %s' %(tmpf.name,tmpfastq.name)) | |
| 139 tmpf.close() | |
| 140 except: | |
| 141 stop_err("Error converting data to fastq format.") | |
| 142 | |
| 143 #make a temp directory where the split fastq files will be stored | |
| 144 try: | |
| 145 split_dir = tempfile.mkdtemp() | |
| 146 split_file_prefix = tempfile.mktemp(dir=split_dir) | |
| 147 splitcmd = 'split -a 2 -l %d %s %s' %(32000000,tmpfastq.name,split_file_prefix) #32M lines correspond to 8M reads | |
| 148 os.system(splitcmd) | |
| 149 os.chdir(split_dir) | |
| 150 for i,fastq in enumerate(os.listdir(split_dir)): | |
| 151 tmpbfq = tempfile.NamedTemporaryFile() | |
| 152 cmd3 = 'maq fastq2bfq %s %s 2>&1; maq map -c %s.csmap %s %s 1>/dev/null 2>&1; maq mapview %s.csmap > %s.txt' %(fastq,tmpbfq.name,fastq,ref_csbfa.name,tmpbfq.name,fastq,fastq) | |
| 153 subprocess_dict['sp'+str(i+1)] = subprocess.Popen([cmd3],shell=True,stdout=subprocess.PIPE) | |
| 154 | |
| 155 while True: | |
| 156 all_done = True | |
| 157 for j,k in enumerate(subprocess_dict.keys()): | |
| 158 if subprocess_dict['sp'+str(j+1)].wait() != 0: | |
| 159 err = subprocess_dict['sp'+str(j+1)].communicate()[1] | |
| 160 if err != None: | |
| 161 stop_err("Mapping error: %s" %err) | |
| 162 all_done = False | |
| 163 if all_done: | |
| 164 break | |
| 165 | |
| 166 cmdout = "for map in *.txt; do cat $map >> %s; done" %(out_fname) | |
| 167 os.system(cmdout) | |
| 168 | |
| 169 tmpcsmap = tempfile.NamedTemporaryFile() | |
| 170 cmd_cat_csmap = "for csmap in *.csmap; do cat $csmap >> %s; done" %(tmpcsmap.name) | |
| 171 os.system(cmd_cat_csmap) | |
| 172 | |
| 173 tmppileup = tempfile.NamedTemporaryFile() | |
| 174 cmdpileup = "maq pileup -m %s -q %s %s %s > %s" %(max_mismatch,min_mapqual,ref_bfa.name,tmpcsmap.name,tmppileup.name) | |
| 175 os.system(cmdpileup) | |
| 176 tmppileup.seek(0) | |
| 177 print >> out_f2, "#chr\tposition\tref_nt\tcoverage\tSNP_count\tA_count\tT_count\tG_count\tC_count" | |
| 178 for line in file(tmppileup.name): | |
| 179 elems = line.strip().split() | |
| 180 ref_nt = elems[2].capitalize() | |
| 181 read_nt = elems[4] | |
| 182 coverage = int(elems[3]) | |
| 183 a,t,g,c = 0,0,0,0 | |
| 184 ref_nt_count = 0 | |
| 185 for ch in read_nt: | |
| 186 ch = ch.capitalize() | |
| 187 if ch not in ['A','T','G','C',',','.']: | |
| 188 continue | |
| 189 if ch in [',','.']: | |
| 190 ch = ref_nt | |
| 191 ref_nt_count += 1 | |
| 192 try: | |
| 193 nt_ind = ['A','T','G','C'].index(ch) | |
| 194 if nt_ind == 0: | |
| 195 a+=1 | |
| 196 elif nt_ind == 1: | |
| 197 t+=1 | |
| 198 elif nt_ind == 2: | |
| 199 g+=1 | |
| 200 else: | |
| 201 c+=1 | |
| 202 except: | |
| 203 pass | |
| 204 print >> out_f2, "%s\t%s\t%s\t%s\t%s\t%s" %("\t".join(elems[:4]),coverage-ref_nt_count,a,t,g,c) | |
| 205 except Exception, er2: | |
| 206 stop_err("Encountered error while mapping: %s" %(str(er2))) | |
| 207 | |
| 208 #Build custom track from pileup | |
| 209 chr_list=[] | |
| 210 out_f2.seek(0) | |
| 211 fcov = tempfile.NamedTemporaryFile() | |
| 212 fout_a = tempfile.NamedTemporaryFile() | |
| 213 fout_t = tempfile.NamedTemporaryFile() | |
| 214 fout_g = tempfile.NamedTemporaryFile() | |
| 215 fout_c = tempfile.NamedTemporaryFile() | |
| 216 fcov.write('''track type=wiggle_0 name="Coverage track" description="Coverage track (from Galaxy)" color=0,0,0 visibility=2\n''') | |
| 217 fout_a.write('''track type=wiggle_0 name="Track A" description="Track A (from Galaxy)" color=255,0,0 visibility=2\n''') | |
| 218 fout_t.write('''track type=wiggle_0 name="Track T" description="Track T (from Galaxy)" color=0,255,0 visibility=2\n''') | |
| 219 fout_g.write('''track type=wiggle_0 name="Track G" description="Track G (from Galaxy)" color=0,0,255 visibility=2\n''') | |
| 220 fout_c.write('''track type=wiggle_0 name="Track C" description="Track C (from Galaxy)" color=255,0,255 visibility=2\n''') | |
| 221 | |
| 222 for line in out_f2: | |
| 223 if line.startswith("#"): | |
| 224 continue | |
| 225 elems = line.split() | |
| 226 chr = elems[0] | |
| 227 | |
| 228 if chr not in chr_list: | |
| 229 chr_list.append(chr) | |
| 230 if not (chr.startswith('chr') or chr.startswith('scaffold')): | |
| 231 chr = 'chr' | |
| 232 header = "variableStep chrom=%s" %(chr) | |
| 233 fcov.write("%s\n" %(header)) | |
| 234 fout_a.write("%s\n" %(header)) | |
| 235 fout_t.write("%s\n" %(header)) | |
| 236 fout_g.write("%s\n" %(header)) | |
| 237 fout_c.write("%s\n" %(header)) | |
| 238 try: | |
| 239 pos = int(elems[1]) | |
| 240 cov = int(elems[3]) | |
| 241 a = int(elems[5]) | |
| 242 t = int(elems[6]) | |
| 243 g = int(elems[7]) | |
| 244 c = int(elems[8]) | |
| 245 except: | |
| 246 continue | |
| 247 fcov.write("%s\t%s\n" %(pos,cov)) | |
| 248 try: | |
| 249 a_freq = a*100./cov | |
| 250 t_freq = t*100./cov | |
| 251 g_freq = g*100./cov | |
| 252 c_freq = c*100./cov | |
| 253 except ZeroDivisionError: | |
| 254 a_freq=t_freq=g_freq=c_freq=0 | |
| 255 fout_a.write("%s\t%s\n" %(pos,a_freq)) | |
| 256 fout_t.write("%s\t%s\n" %(pos,t_freq)) | |
| 257 fout_g.write("%s\t%s\n" %(pos,g_freq)) | |
| 258 fout_c.write("%s\t%s\n" %(pos,c_freq)) | |
| 259 | |
| 260 fcov.seek(0) | |
| 261 fout_a.seek(0) | |
| 262 fout_g.seek(0) | |
| 263 fout_t.seek(0) | |
| 264 fout_c.seek(0) | |
| 265 os.system("cat %s %s %s %s %s | cat > %s" %(fcov.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_f3name)) | |
| 266 | |
| 267 if __name__=="__main__": | |
| 268 __main__() | |
| 269 | |
| 270 |
