changeset 0:d1b35bcdaacc draft

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 10:49:05 -0400
parents
children 3d9544198582
files substitution_rates.py substitution_rates.xml test-data/Interval2Maf_pairwise_out.maf test-data/subRates1.out
diffstat 4 files changed, 246 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/substitution_rates.py	Tue Apr 01 10:49:05 2014 -0400
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+#guruprasad Ananda
+"""
+Estimates substitution rates from pairwise alignments using JC69 model.
+"""
+
+from galaxy import eggs
+from galaxy.tools.util.galaxyops import *
+from galaxy.tools.util import maf_utilities
+import bx.align.maf
+import fileinput
+import sys
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+
+if len(sys.argv) < 3:
+    stop_err("Incorrect number of arguments.")
+
+inp_file = sys.argv[1]
+out_file = sys.argv[2]
+fout = open(out_file, 'w')
+int_file = sys.argv[3]
+if int_file != "None":     #The user has specified an interval file
+    dbkey_i = sys.argv[4]
+    chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
+
+
+def rateEstimator(block):
+    global alignlen, mismatches
+
+    src1 = block.components[0].src
+    sequence1 = block.components[0].text
+    start1 = block.components[0].start
+    end1 = block.components[0].end
+    len1 = int(end1)-int(start1)
+    len1_withgap = len(sequence1)
+    mismatch = 0.0
+    
+    for seq in range (1, len(block.components)):
+        src2 = block.components[seq].src
+        sequence2 = block.components[seq].text
+        start2 = block.components[seq].start
+        end2 = block.components[seq].end
+        len2 = int(end2)-int(start2)
+        for nt in range(len1_withgap):
+            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?':  # Not a gap or masked character
+                if sequence1[nt].upper() != sequence2[nt].upper():
+                    mismatch += 1
+    
+    if int_file == "None":
+        p = mismatch/min(len1, len2)
+        print >> fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" % ( src1, start1, end1, src2, start2, end2, min(len1, len2), mismatch, p )
+    else:
+        mismatches += mismatch
+        alignlen += min(len1, len2)
+
+
+def main():
+    skipped = 0
+    not_pairwise = 0
+    
+    if int_file == "None":
+        try:
+            maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+        except:
+            stop_err("Your MAF file appears to be malformed.")
+        print >> fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
+        for block in maf_reader:
+            if len(block.components) != 2:
+                not_pairwise += 1
+                continue
+            try:
+                rateEstimator(block)
+            except:
+                skipped += 1
+    else:
+        index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
+        if index is None:
+            print >> sys.stderr, "Your MAF file appears to be malformed."
+            sys.exit()
+        win = NiceReaderWrapper( fileinput.FileInput( int_file ),
+                                chrom_col=chr_col_i,
+                                start_col=start_col_i,
+                                end_col=end_col_i,
+                                strand_col=strand_col_i,
+                                fix_strand=True)
+        species = None
+        mincols = 0
+        global alignlen, mismatches
+        
+        for interval in win:
+            alignlen = 0
+            mismatches = 0.0
+            src = "%s.%s" % ( dbkey_i, interval.chrom )
+            for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
+                if len(block.components) != 2:
+                    not_pairwise += 1
+                    continue
+                try:
+                    rateEstimator(block)
+                except:
+                    skipped += 1
+            if alignlen:
+                p = mismatches/alignlen
+            else:
+                p = 'NA'
+            interval.fields.append(str(alignlen))
+            interval.fields.append(str(mismatches))
+            interval.fields.append(str(p))
+            print >> fout, "\t".join(interval.fields)
+            #num_blocks += 1
+    
+    if not_pairwise:
+        print "Skipped %d non-pairwise blocks" % (not_pairwise)
+    if skipped:
+        print "Skipped %d blocks as invalid" % (skipped)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/substitution_rates.xml	Tue Apr 01 10:49:05 2014 -0400
@@ -0,0 +1,61 @@
+<tool id="subRate1" name="Estimate substitution rates " version="1.0.0">
+  <description> for non-coding regions</description>
+  <command interpreter="python">
+  	substitution_rates.py 
+  	$input 
+  	$out_file1
+  	#if $region.type == "win":
+      ${region.input2} ${region.input2.dbkey} ${region.input2.metadata.chromCol},$region.input2.metadata.startCol,$region.input2.metadata.endCol,$region.input2.metadata.strandCol
+    #else:
+      "None"
+    #end if 
+  </command>
+  <inputs>
+    <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/>
+    <conditional name="region">
+	      <param name="type" type="select" label="Estimate rates corresponding to" multiple="false">
+	         <option value="align">Alignment block</option>
+	         <option value="win">Intervals in your history</option>
+	     </param>
+	     <when value="win">
+	      	<param format="interval" name="input2" type="data" label="Choose intervals">
+	      		<validator type="unspecified_build" />
+	    	</param>
+	      </when>
+	      <when value="align" />
+      </conditional>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" metadata_source="input"/>
+  </outputs>
+  
+  <tests>
+    <test>
+      <param name="input" value="Interval2Maf_pairwise_out.maf"/>
+      <param name="type" value="align"/>
+      <output name="out_file1" file="subRates1.out"/>
+    </test>
+  </tests>
+  
+ <help> 
+
+.. class:: infomark
+
+**What it does**
+
+This tool takes a pairwise MAF file as input and estimates substitution rate according to Jukes-Cantor JC69 model. The 3 new columns appended to the output are explained below:
+
+- L: number of nucleotides compared
+- N: number of different nucleotides
+- p = N/L
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Any block/s not containing exactly two sequences, will be omitted. 
+
+  </help>  
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Interval2Maf_pairwise_out.maf	Tue Apr 01 10:49:05 2014 -0400
@@ -0,0 +1,49 @@
+##maf version=1
+a score=10458
+s hg17.chrX   3816458 525 - 154824264 tGATGCACGTTTTCTCTTTTCATACTCATCTT--CAGTTATTCCTTGACACAAACCTGGTTA---CTAGTGCTATTGGCTACATGGCCCCTGTGCCTTAACTTCCCTCATGGCTCCTCCTCCCTGCAGCCTCTGAAGGTTCTCTGTGTAATGAATT--AATTAGCTTTGCTTTTTTCTCCCTTTCCCCC-----CCTCTTTTTCCTGGCCTCT---------AGAAGAAAACACCAGCAGCCCCAGCAAAGAAAACCAGCACTACCTTCAACATCGTGGGGACCACCTATCCCATCAACCTGGCCAAGGACACTGAATTTTCCACCATCTCCAAGGGCGCTGCTCCCAGTGCCTCCTCAACCCCAACAATCATTGCTTCACCCAAGGCCACCTACGTGCAGGACAGCCCGACTGAGACCAAGA---CCTACAACAGTGTCAGCAAGGTTGACAAAATTTCCCGCATCATCTTTCCTGTGCTCTTTGCCATATTCAATCTGGTCTATTGGGCCACATATGTCAACCGGGAGTCAGCTATCAAGGGCATGA 
+s fr1.chrUn 343715247 529 - 349519338 TGGTCAACATTTTTTTCTTTTCTACTTTTCTTTTCAGGAACCGTTTCAGGCTAACATGGCCAGGATTAAAGATAT-GGTTAGGCTTTTGTTTTGTTCTTGTGTGTGTCGTGTCTCGTGGT----GTTGCTTCTATTTCCTCTTCATGAGTTCAAATGGAAGTATTGTTAAGTTTGTTGCCCCCGACCCTAACGACCTCTGCTTTATGGCCTCTTCTCTTTTCAGAGGAGAGAATCTGAGAATCTGACCAAAAAGACGAACAACACCTTCAACATCGTCGGGACGACCTACGCCATCAACGTGGCGAAGGACCAGGGCTTAACCACCATCTCCAAAAGCGCCACCACCGGCGCGCC----GGCCAAACAGCACTACCTGCACAAAA-----------TGGACGACCCCTACACAGAGGCCAAGAAATCCTACAACCGAGTCAGCATAGTGGACAAAATATCACGCATCATTTTCCCAGTTCTGTTCTTCATCTTCAACTTGGGCTACTGGGCCACATATATCAACAGGAAGCCCATTATTATGAACGCGA 
+
+a score=12779
+s hg17.chrX   3795168 357 - 154824264 aGGGCCATTTTGAGTGGGGTAAAGCTTCTCCTTTCAAATAGGAAA----TTAGTGATTTCATCG--AATAAGC-------TCTCAATGTCTTCCAGTTGCAACTGGTTCTCATCTTTCCTCT----CCTCAGGTGTCACCACTGTGCTTACCATGACCACCTTGAGTATCAGTGCCAGAAATTCCTTACCTAAAGTGGCATATGCGACGGCCATGGACTGGTTCATAGCCGTCTGTTATGCCTTTGTATTTTCTGCACTGATTGAATTTGCCACTGTCAACTATTTCACCAAGCGGAGTTGGGCTTGGGAAGGCAAGAAGGTGCCAGAGGCCCTGGAGATGAAGGTGAGATA----TAGCAAAAGAGAAACTGAGAAG 
+s fr1.chrUn 343710815 364 - 349519338 TGAGCCAATTT--------TGAAGTGACAATTTTCAAATGTGCCACATTTTAGTAGCAGCAAAGTCAGAAAGCGACTTTTCCTCTACATCACTTCTCTGATCTGTGTTCTGATGTTTTCCTTTCATCCTCAGGGGTCACCACTGTGCTTACCATGACCACCCTGAGCATCAGTGCTAGAAACTCCCTCCCTAAGGTGGCCTACGCCACCGCCATGGACTGGTTCATGGCGGTGTGCTACGCCTTTGTCTTCTCTGCCTTGATTGAGTTTGCGACGGTCAACTACTTCACCAAACGCAGCTGGGCATGGGATGGGAAAAA------AGAGGGCATGGAAATAAGGGTGAGTGATTTTTAATCAAATATATACTGATACG 
+
+a score=10363
+s hg17.chrX   3787425 174 - 154824264 TACTCTCTCTTTAGGAGAATATGTCGTCATGACAACCCACTTCCATCTCAAGCGAAAAATTGGCTACTTTGTGATCCAGACCTACTTGCCATGTATCATGACTGTCATTCTGTCACAAGTGTCGTTCTGGCTCAACAGAGAGTCTGTTCCTGCCCGTACAGTCTTTGGTGAGTG 
+s fr1.chrUn 343708230 174 - 349519338 TCCTCCCGCTGCAGGCGAATATGTTGTGATGACAACATATTTCCATTTGAAAAGGAAAATTGGCTATTTCGTGATTCAAACCTACCTCCCGTGCATCATGACTGTCATTCTGTCTCAAGTCTCCTTCTGGCTGAACCGAGAATCGGTCCCTGCTCGCACCGTGTTCGGTGAGTG 
+
+a score=31
+s hg17.chrX  3787284 100 - 154824264 ATTTTGTGATTATTAAAGGGCTCTAGTTATATGAACTTGGGAGATGCTGGAATATTGAATATAAG----ATTAAAGAGCTTTGTGAAT--------TGAATT-----GGGGCCCAAC 
+s fr1.chrUn 62078707 109 - 349519338 AGTTTGTGAGGCCTAAATGCCAAAAGT-------ACTTGA-ACAGGCTTAAGAATGGATCATGAATTTTCTTAAATAAATCTGTTAATCCAGAAAGTAAACTCTGGGGGGGTCCAGA 
+
+a score=4963
+s hg17.chrX   3776942 285 - 154824264 aCTTCAGTTGAAGACAGACCACATACCCAGAACTCTTTGAAGATGGAAGTTTTATTTCGATCTCATTTGAGTTTAGTATCAAGGGTAATCATTCTCCATTTCCTTCAACTTCTCTTTATTGACCCTTCTCTGCC--TTAGATGCCTATACAACAGCTGAAGTGGTTTATTCTTGGACTCTCGGAAAGAACAAATCCGTGGAAGTGGCACAGGATGGTTCTCGCTTGAACCAGTATGACCTTTTGGGCCATGTTGTTGGGACAGAGATAATCCGGTCTAGTACAGGTA 
+s fr1.chrUn 343707053 283 - 349519338 AACTTAAATCAACACCAAGTAAAGGCAAAGAATCTTTTAAACTAAGAA----TCTATTAAACTAGATAAACTAAAGCATCGACACTGACGCGTTGATCTGTCATCCACGTTTCCCCTGTTTCCTTTCCTTCTCCATCTAGACGCTTACACCAACAATGAGGTGATTTACCTGTGGACTCAGGGGGACGAGAGGTCTGTGTCTGTCGCAGAGGACGGCTCCAGGCTGAACCAGTATGATTTACTGGGACACGTTATTGGCAAAGAAACCATCAGCTCCAGCACAGGTA 
+
+a score=5912
+s hg17.chrX   3760375 93 - 154824264 TTAGGTTAACAATTCATGCTGAGTGTCCCATGCATTTGGAAGATTTTCCCATGGATGTGCATGCCTGCCCACTGAAGTTTGGAAGCTGTATGT 
+s fr1.chrUn 343706399 93 - 349519338 TCAGGTTGACCATCCACGCTGAGTGCCCCATGCACCTGGAGGACTTCCCCATGGACTCGCACGCCTGTCCTCTCAAGTTTGGAAGTTGTAAGT 
+
+a score=4332
+s hg17.chrX   3733405 476 - 154824264 ATCAAGGAGAATTTGGGGAAACAGAGCATTAGGGAAGGAATAGATGGAAGAGAGGCATGCTTGTAATAAATTATAGGAAAGAGAAGAGTTTGAAAAAGAGATAAGAGTTCACACAATTGTGGAAAGAATGGAGTGGAGTATATGTAAACAGAAAAGAAGCTGTATGGAATGAGGTGGATGGAGGGACAGAATAATGAGGAGGAGAT---GTGGATGGGCAGAAGACAGTAAGCAAATGGAAGAGAAGCAAGA-----GGAAGGAGAGAGTAATGGGCAGGAAAGGATCTGAACCCAGTAGAGATTGCAAATAGTGAGGAGAAATAGAGAATGAATGAAAAGAACAGG--ATAGAGGAGAAACAACAATAAAAGGAATAAAGCATAGAGAATAGGTTTAAGTTTTGGAGGAGGGATAGAGAA-----GAGAGAAGAAAGGAGAGAGGATGGGAATATTGAAATACAAAAAAATAGAAGGTGCAGGTTGAGGA 
+s fr1.chrUn 303515824 444 + 349519338 AACACGGAATATGGAGGGAAAACGGGGAATATGGAGGGAACAG--GGAATATGG-------------------TGGGAACACAGAGGGAATATGGAGGGAACACGGG-------GAATATGGAGGGAA---AACGGGGAATATGGAGGGAACGGGGAATATGGAGGGAACGAGGAATATGGAGGGAACGGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACGAGGAATATGGAGGGAACACAGGGAATATGGAGGGAACGAGGAATATGGAGGGAACACAGGGAAT--ATGGAGGGAACGGGGAATATGGAGGGAACACAGGGAATATGGAGGGAACACAGGGAATAT--------------GGAGGGAACGAGGAATATGGAGGGAACACAGGGAATATGGAGGGAACGAGGAATATGGAGGGAACACGGGGTACTCAGTGAGAA 
+
+a score=719
+s hg17.chrX   3731355 108 - 154824264 TATGTAAGTAGAATGCAGTCAAGAAACTTGTAGGTGGATAGGTAGAGGCAGAAGGAAGAGAGGAAGGACGTTCTAGACAGAGAAAACCAAGTGAGCAGTTGCAGGGAA 
+s fr1.chrUn 303515724  91 + 349519338 TATGGAGGGAACACG------GGGAATATGGAGG-GAATATAGAGGGGAGACGGGGAATATGGAGGGA------ACACGGGGAATATGGAGGGAACA----CAGGGAA 
+
+a score=3647
+s hg17.chrX   3730591 447 - 154824264 CAAGGAAAAAAGGATAATACTTCAGAGAGAAGTTGGATGAGAGATGGGAGGAA--AAATGAACATAGAGGAGAAATGAGCATAGAGTGGAGGATAAAATAATGAAGAAGGAAAGTGGATAGAAGTGGGAAGATGAGGGTCAAGAGGGGAGGAGAAGACTAATTACACAGAGTACATAGAGGCATGTATATTGGAGAAATGAGAATGGTGAAAAGAAAGAGAGTATGAGAAGAGAGGACAAAAAGAGTAGAGAAGTTGAGAAGACAAAGGATAAGATGGTGTAAAATAAAGAAGCTAGATTAGATATTAGGATATTCTAAAGAGATAGGCGAATATAATGAAGTGGATGTAGTAGATGGGAATAAAGAGGAAAAGACATAGAGAAGTAGGGGAAGGGACATGGGAGGAGATGAACAAGAGGGAGGGAAATTGTAGACGAAAGAAAGAGAA 
+s fr1.chrUn 303515378 346 + 349519338 CACGGGGAATATGGCGGGAACACAGAGGGAATATGGAGGGAACACGGGGAATATGGAGGGAATATAGAGGGGAGACGGG--GAATATGGAGGG-AACATGGGGAATATGGAGGGAATATAGA---GGGGAGACGGGGAATATGGAGGGA--------------ACACGGGGAATATGGAGG---GAACACAGG------GAATATGGCGGGAACACAGGGAATATG----GAGGGAACACGGGGAATATGGAGG----GAACACAGGGAATATGGAGG----AAATATAGAGGGAAAACGGGAAAT------------------------------ATGGAGGGAATATGGT----GGGAACACAGGGAATA----------------TGGAGGGAACACGGGG----------AATATGGAGGGAA--TATAGAGGGGAGACGGGGAA 
+
+a score=13113
+s hg17.chrX   3729219 238 - 154824264 CCCCTAGGAGTACACTATTGATGTATTTTTTCGGCAGACATGGCATGATGAAAGACTGAAATTTGATGGCCCCATGAAGATCCTTCCACTGAACAATCTCCTGGCTAGTAAGATCTGGACACCGGACACCTTCTTCCACAATGGCAAGAAATCAGTGGCTCATAACATGACCACGCCCAACAAGCTGCTCAGATTGGTGGACAACGGAACCCTCCTCTATACAATGAGGTCAGAGAGT 
+s fr1.chrUn 343703525 238 - 349519338 CCCTCAGGAGTACACCATCGACGTCTTCTTCCGTCAGAGTTGGAGAGACGAGAGGCTAAAGTTTGATGGACCTATGCAGGTTTTACCCCTGAACAACCTCCTGGCCAGTAAGATTTGGACTCCTGACACCTTTTTCCACAACGGAAAGAAGTCTGTTGCCCACAACATGACGACTCCGAACAAACTGCTGCGCCTGGTCGACAACGGCACGCTTCTATATACAATGAGGTGAGGCGGT 
+
+a score=3528
+s hg17.chrX   3700391 307 - 154824264 AGTAAGTTGGCTGTCATTCAGAGT-ACTGCACTG-GACTCT---CTCTGAGCTAAGACAAAAGCCTTTCCTGACATTCTGCTTTTCTACTTTCTTTCACGCAGATGCAGTGACTGAAGTGAAGACTGACATCTACGTGACCAGTTTTGGCCCTGTGTCAGACACTGACATGGTA---------AGTTCTTGCAAAGGTAAATGTA---AAAGAGGATGCACT----AGACCAAAGAAAATAAATTGGAGCTTTCCTAAGGC---AGGGCCTTGTAGCTATATTCTAAGGAGTGGTCTTTTCTCA--TATGAATTT--CCTACCCTCGCCACCCTA 
+s fr1.chrUn 241017738 330 - 349519338 ATTCATTTGTATGTCCTTTTGTGTTACTAAGCTGTGTCTCTAATTCCAGAGCGAAAACAAAACTCATTAGATTTGTGTCATTTGTTCATTTGCTC---CATAGGTCCTGTAACTGAGGTCAAGACAGATATTTATGTGACAAGTTTTGGGCCTGTGTCAGATGTTGAAATGGTACGTACACCGAGTGTTATTGATGCTAAATTTACCTCAGGAGGGGACACTTTTAATACCTTATAGAACATTATAGAGATTCTGGGACGTGCAAGGGCAGTGCAGGAAAGGCC--AGGTGGGGTCAGGTCTTAGTTATGAAGCGGACATACTGTCGCAGCCCTA 
+
+a score=5471
+s hg17.chrX   3639441 205 - 154824264 TCTTGAATTACGATATCTA-GTCATGT---ATTTTTTTTCCTATTAAGAACTATTATTTTTTTTCTTTTC---CTTGCATTGCCTCAGGCTGTCTC---CTAAGCATGCCCCAGATATTCCTGATGACAGCACTGACAACATCACTATCTTCACCAGAATCTTGGATCGTCTTCTGGACGGCTATGACAACCGGCTGCGACCTGGGCTTGGAGGT 
+s fr1.chrUn 333536350 213 + 349519338 TCATTAATCACAAAGACCACGTCATGTTTGACGTCTTGTGGTTTTAAGAAGCGCAAGCCTTGTTCTTTTCACTCTCCCCTTCCTCTCGTCTCTTCCAGGCTCTGCTTGTCCCAGTCTGAGCCGCCTAC--CAACGACAACAGCACAGTTTTCACCAGGATCCTGGATGGGCTCCTTGACGGTTACGACAACAGGCTGCGACCTGGGCTTGGGGGT 
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/subRates1.out	Tue Apr 01 10:49:05 2014 -0400
@@ -0,0 +1,13 @@
+#Seq1	Start1	End1	Seq2	Start2	End2	L	N	p
+hg17.chrX	3816458	3816983	fr1.chrUn	343715247	343715776	525	188	0.3581
+hg17.chrX	3795168	3795525	fr1.chrUn	343710815	343711179	357	92	0.2577
+hg17.chrX	3787425	3787599	fr1.chrUn	343708230	343708404	174	37	0.2126
+hg17.chrX	3787284	3787384	fr1.chrUn	62078707	62078816	100	33	0.3300
+hg17.chrX	3776942	3777227	fr1.chrUn	343707053	343707336	283	122	0.4311
+hg17.chrX	3760375	3760468	fr1.chrUn	343706399	343706492	93	20	0.2151
+hg17.chrX	3733405	3733881	fr1.chrUn	303515824	303516268	444	186	0.4189
+hg17.chrX	3731355	3731463	fr1.chrUn	303515724	303515815	91	36	0.3956
+hg17.chrX	3730591	3731038	fr1.chrUn	303515378	303515724	346	126	0.3642
+hg17.chrX	3729219	3729457	fr1.chrUn	343703525	343703763	238	57	0.2395
+hg17.chrX	3700391	3700698	fr1.chrUn	241017738	241018068	307	112	0.3648
+hg17.chrX	3639441	3639646	fr1.chrUn	333536350	333536563	205	66	0.3220