comparison substitution_rates.py @ 0:d1b35bcdaacc draft

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 10:49:05 -0400
parents
children 3d9544198582
comparison
equal deleted inserted replaced
-1:000000000000 0:d1b35bcdaacc
1 #!/usr/bin/env python
2 #guruprasad Ananda
3 """
4 Estimates substitution rates from pairwise alignments using JC69 model.
5 """
6
7 from galaxy import eggs
8 from galaxy.tools.util.galaxyops import *
9 from galaxy.tools.util import maf_utilities
10 import bx.align.maf
11 import fileinput
12 import sys
13
14 def stop_err(msg):
15 sys.stderr.write(msg)
16 sys.exit()
17
18
19 if len(sys.argv) < 3:
20 stop_err("Incorrect number of arguments.")
21
22 inp_file = sys.argv[1]
23 out_file = sys.argv[2]
24 fout = open(out_file, 'w')
25 int_file = sys.argv[3]
26 if int_file != "None": #The user has specified an interval file
27 dbkey_i = sys.argv[4]
28 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
29
30
31 def rateEstimator(block):
32 global alignlen, mismatches
33
34 src1 = block.components[0].src
35 sequence1 = block.components[0].text
36 start1 = block.components[0].start
37 end1 = block.components[0].end
38 len1 = int(end1)-int(start1)
39 len1_withgap = len(sequence1)
40 mismatch = 0.0
41
42 for seq in range (1, len(block.components)):
43 src2 = block.components[seq].src
44 sequence2 = block.components[seq].text
45 start2 = block.components[seq].start
46 end2 = block.components[seq].end
47 len2 = int(end2)-int(start2)
48 for nt in range(len1_withgap):
49 if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': # Not a gap or masked character
50 if sequence1[nt].upper() != sequence2[nt].upper():
51 mismatch += 1
52
53 if int_file == "None":
54 p = mismatch/min(len1, len2)
55 print >> fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" % ( src1, start1, end1, src2, start2, end2, min(len1, len2), mismatch, p )
56 else:
57 mismatches += mismatch
58 alignlen += min(len1, len2)
59
60
61 def main():
62 skipped = 0
63 not_pairwise = 0
64
65 if int_file == "None":
66 try:
67 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
68 except:
69 stop_err("Your MAF file appears to be malformed.")
70 print >> fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
71 for block in maf_reader:
72 if len(block.components) != 2:
73 not_pairwise += 1
74 continue
75 try:
76 rateEstimator(block)
77 except:
78 skipped += 1
79 else:
80 index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
81 if index is None:
82 print >> sys.stderr, "Your MAF file appears to be malformed."
83 sys.exit()
84 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
85 chrom_col=chr_col_i,
86 start_col=start_col_i,
87 end_col=end_col_i,
88 strand_col=strand_col_i,
89 fix_strand=True)
90 species = None
91 mincols = 0
92 global alignlen, mismatches
93
94 for interval in win:
95 alignlen = 0
96 mismatches = 0.0
97 src = "%s.%s" % ( dbkey_i, interval.chrom )
98 for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
99 if len(block.components) != 2:
100 not_pairwise += 1
101 continue
102 try:
103 rateEstimator(block)
104 except:
105 skipped += 1
106 if alignlen:
107 p = mismatches/alignlen
108 else:
109 p = 'NA'
110 interval.fields.append(str(alignlen))
111 interval.fields.append(str(mismatches))
112 interval.fields.append(str(p))
113 print >> fout, "\t".join(interval.fields)
114 #num_blocks += 1
115
116 if not_pairwise:
117 print "Skipped %d non-pairwise blocks" % (not_pairwise)
118 if skipped:
119 print "Skipped %d blocks as invalid" % (skipped)
120
121
122 if __name__ == "__main__":
123 main()