0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os, sys
|
|
4
|
|
5 assert sys.version_info[:2] >= ( 2, 4 )
|
|
6
|
|
7 def stop_err( msg ):
|
|
8 sys.stderr.write( "%s\n" % msg )
|
|
9 sys.exit()
|
|
10
|
|
11 def reverse_complement(s):
|
|
12 complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
|
|
13 reversed_s = []
|
|
14 for i in s:
|
|
15 reversed_s.append(complement_dna[i])
|
|
16 reversed_s.reverse()
|
|
17 return "".join(reversed_s)
|
|
18
|
|
19 def __main__():
|
|
20 nuc_index = {'a':0,'t':1,'c':2,'g':3}
|
|
21 diff_hash = {} # key = (chrom, index)
|
|
22 infile = sys.argv[1]
|
|
23 outfile = sys.argv[2]
|
|
24 invalid_lines = 0
|
|
25 invalid_chars = 0
|
|
26 data_id = ''
|
|
27 data_seq = ''
|
|
28
|
|
29 for i, line in enumerate( open( infile ) ):
|
|
30 line = line.rstrip( '\r\n' )
|
|
31 if not line or line.startswith( '#' ):
|
|
32 continue
|
|
33 fields = line.split()
|
|
34 if len(fields) != 23: # standard number of pslx columns
|
|
35 invalid_lines += 1
|
|
36 continue
|
|
37 if not fields[0].isdigit():
|
|
38 invalid_lines += 1
|
|
39 continue
|
|
40 read_id = fields[9]
|
|
41 chrom = fields[13]
|
|
42 try:
|
|
43 block_count = int(fields[17])
|
|
44 except:
|
|
45 invalid_lines += 1
|
|
46 continue
|
|
47 block_size = fields[18].split(',')
|
|
48 read_start = fields[19].split(',')
|
|
49 chrom_start = fields[20].split(',')
|
|
50 read_seq = fields[21].split(',')
|
|
51 chrom_seq = fields[22].split(',')
|
|
52
|
|
53 for j in range(block_count):
|
|
54 try:
|
|
55 this_block_size = int(block_size[j])
|
|
56 this_read_start = int(read_start[j])
|
|
57 this_chrom_start = int(chrom_start[j])
|
|
58 except:
|
|
59 invalid_lines += 1
|
|
60 break
|
|
61 this_read_seq = read_seq[j]
|
|
62 this_chrom_seq = chrom_seq[j]
|
|
63
|
|
64 if not this_read_seq.isalpha():
|
|
65 continue
|
|
66 if not this_chrom_seq.isalpha():
|
|
67 continue
|
|
68
|
|
69 # brut force to check coverage
|
|
70 for k in range(this_block_size):
|
|
71 cur_index = this_chrom_start+k
|
|
72 sub_a = this_read_seq[k:(k+1)].lower()
|
|
73 sub_b = this_chrom_seq[k:(k+1)].lower()
|
|
74 if not diff_hash.has_key((chrom, cur_index)):
|
|
75 try:
|
|
76 diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()] # a, t, c, g, ref. nuc.
|
|
77 except Exception, e:
|
|
78 stop_err( str( e ) )
|
|
79 if sub_a in ['a','t','c','g']:
|
|
80 diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1
|
|
81 else:
|
|
82 invalid_chars += 1
|
|
83
|
|
84 outputfh = open(outfile, 'w')
|
|
85 outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" )
|
|
86 keys = diff_hash.keys()
|
|
87 keys.sort()
|
|
88 for i in keys:
|
|
89 (chrom, location) = i
|
|
90 sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ] # did not include N's
|
|
91 if sum == 0:
|
|
92 continue
|
|
93 ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum
|
|
94 ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum
|
|
95 ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum
|
|
96 ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum
|
|
97 (title_head, title_tail) = os.path.split(chrom)
|
|
98 result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G )
|
|
99 outputfh.write(result)
|
|
100 outputfh.close()
|
|
101
|
|
102 if invalid_lines:
|
|
103 print 'Skipped %d invalid lines. ' % ( invalid_lines )
|
|
104 if invalid_chars:
|
|
105 print 'Skipped %d invalid characters in the alignment. ' % (invalid_chars)
|
|
106
|
|
107 if __name__ == '__main__': __main__() |