annotate tools/regVariation/qv_to_bqv.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Adapted from bx/scripts/qv_to_bqv.py
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 Convert a qual (qv) file to several BinnedArray files for fast seek.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 This script takes approximately 4 seconds per 1 million base pairs.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 The input format is fasta style quality -- fasta headers followed by
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 whitespace separated integers.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 usage: %prog qual_file output_file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 import pkg_resources
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 pkg_resources.require( "numpy" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 import string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 import psyco_full
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 import sys, re, os, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 from bx.binned_array import BinnedArrayWriter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 from bx.cookbook import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 import fileinput
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 def load_scores_ba_dir( dir ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 Return a dict-like object (keyed by chromosome) that returns
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 FileBinnedArray objects created from "key.ba" files in `dir`
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 return FileBinnedArrayDir( dir )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 args = sys.argv[1:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 qual_file_dir = args[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 #mydir="/home/gua110/Desktop/chimp_quality_scores/chr22.qa"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 mydir="/home/gua110/Desktop/rhesus_quality_scores/rheMac2.qual.qv"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 qual_file_dir = mydir.replace(mydir.split("/")[-1], "")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 output_file = args[ 1 ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 fo = open(output_file,"w")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 print "usage: qual_file output_file"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 tmpfile = tempfile.NamedTemporaryFile()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 cmdline = "ls " + qual_file_dir + "*.qa | cat >> " + tmpfile.name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 os.system (cmdline)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 for qual_file in tmpfile.readlines():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 qual = fileinput.FileInput( qual_file.strip() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 outfile = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 outbin = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 base_count = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 mega_count = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 for line in qual:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 line = line.rstrip("\r\n")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 if line.startswith(">"):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 # close old
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 if outbin and outfile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 outbin.finish()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 outfile.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 # start new file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 region = line.lstrip(">")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 #outfname = output_file + "." + region + ".bqv" #CHANGED
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 outfname = qual_file.strip() + ".bqv"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 print >>fo, "Writing region " + region + " to file " + outfname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 outfile = open( outfname , "wb")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 outbin = BinnedArrayWriter(outfile, typecode='b', default=0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 base_count = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 mega_count = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 if outfile and outbin:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 nums = line.split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 for val in nums:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 outval = int(val)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 assert outval <= 255 and outval >= 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 outbin.write(outval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 base_count += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 if (mega_count * 1000000) <= base_count:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 sys.stdout.write(str(mega_count)+" ")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 sys.stdout.flush()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 mega_count = base_count // 1000000 + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 if outbin and outfile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 outbin.finish()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 outfile.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 main()