Mercurial > repos > devteam > mutate_snp_codon

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutate_snp_codon.py	Wed Sep 25 11:27:51 2013 -0400
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+"""
+Script to mutate SNP codons.
+Dan Blankenberg
+"""
+
+import sys, string
+
+def strandify( fields, column ):
+    strand = '+'
+    if column >= 0 and column < len( fields ):
+        strand = fields[ column ]
+        if strand not in [ '+', '-' ]:
+            strand = '+'
+    return strand
+
+def main():
+    # parse command line
+    input_file = sys.argv[1]
+    out = open( sys.argv[2], 'wb+' )
+    codon_chrom_col = int( sys.argv[3] ) - 1
+    codon_start_col = int( sys.argv[4] ) - 1
+    codon_end_col = int( sys.argv[5] ) - 1
+    codon_strand_col = int( sys.argv[6] ) - 1
+    codon_seq_col = int( sys.argv[7] ) - 1
+
+    snp_chrom_col = int( sys.argv[8] ) - 1
+    snp_start_col = int( sys.argv[9] ) - 1
+    snp_end_col = int( sys.argv[10] ) - 1
+    snp_strand_col = int( sys.argv[11] ) - 1
+    snp_observed_col = int( sys.argv[12] ) - 1
+
+    max_field_index = max( codon_chrom_col, codon_start_col, codon_end_col, codon_strand_col, codon_seq_col, snp_chrom_col, snp_start_col, snp_end_col, snp_strand_col, snp_observed_col )
+
+    DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
+    skipped_lines = 0
+    errors = {}
+    for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 3' ), ( 'snp_observed', 'SNP observed values must have length 3' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]:
+        errors[ name ] = { 'count':0, 'message':message }
+    line_count = 0
+    for line_count, line in enumerate( open( input_file ) ):
+        line = line.rstrip( '\n\r' )
+        if line and not line.startswith( '#' ):
+            fields = line.split( '\t' )
+            if max_field_index >= len( fields ):
+                skipped_lines += 1
+                errors[ 'max_field_index' ]['count'] += 1
+                continue
+
+            #read codon info
+            codon_chrom = fields[codon_chrom_col]
+            codon_start = int( fields[codon_start_col] )
+            codon_end = int( fields[codon_end_col] )
+            if codon_end - codon_start != 3:
+                #codons must be length 3
+                skipped_lines += 1
+                errors[ 'codon_len' ]['count'] += 1
+                continue
+            codon_strand = strandify( fields, codon_strand_col )
+            codon_seq = fields[codon_seq_col].upper()
+            if len( codon_seq ) != 3:
+                #codon sequence must have length 3
+                skipped_lines += 1
+                errors[ 'codon_seq' ]['count'] += 1
+                continue
+
+            #read snp info
+            snp_chrom = fields[snp_chrom_col]
+            snp_start = int( fields[snp_start_col] )
+            snp_end = int( fields[snp_end_col] )
+            if snp_end - snp_start != 1:
+                #snps must be length 1
+                skipped_lines += 1
+                errors[ 'snp_len' ]['count'] += 1
+                continue
+            snp_strand = strandify( fields, snp_strand_col )
+            snp_observed = fields[snp_observed_col].split( '/' )
+            snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ]
+            if not snp_observed:
+                #sequence replacements must be length 1
+                skipped_lines += 1
+                errors[ 'snp_observed' ]['count'] += 1
+                continue
+
+            #Determine index of replacement for observed values into codon
+            offset = snp_start - codon_start
+            #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. position 0 == position 2
+            if codon_strand == '-':
+                offset = 2 - offset
+            if offset < 0 or offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset )
+                #codon and snp do not overlap
+                skipped_lines += 1
+                errors[ 'no_overlap' ]['count'] += 1
+                continue
+
+            for observed in snp_observed:
+                if codon_strand != snp_strand:
+                    #if our SNP is on a different strand than our codon, take complement of provided observed SNP base
+                    observed = observed.translate( DNA_COMP )
+                snp_codon = [ char for char in codon_seq ]
+                snp_codon[offset] = observed.upper()
+                snp_codon = ''.join( snp_codon )
+
+                if codon_seq != snp_codon: #only output when we actually have a different codon
+                    out.write( "%s\t%s\n" % ( line, snp_codon )  )
+        else:
+            skipped_lines += 1
+            errors[ 'empty_comment' ]['count'] += 1
+    if skipped_lines:
+        print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) )
+
+if __name__ == "__main__": main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutate_snp_codon.xml	Wed Sep 25 11:27:51 2013 -0400
@@ -0,0 +1,67 @@
+<tool id="mutate_snp_codon_1" name="Mutate Codons" version="1.0.0">
+  <description>with SNPs</description>
+  <command interpreter="python">mutate_snp_codon.py $input1 $output1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $codon_seq_col $snp_chrom_col $snp_start_col $snp_end_col $snp_strand_col $snp_observed_col</command>
+  <inputs>
+    <param name="input1" type="data" format="interval" label="Interval file with joined SNPs" optional="False" help="The interval metadata for this file should be set for the codon positions."/>
+    <param name="codon_seq_col" label="Codon Sequence column" type="data_column" data_ref="input1" />
+    <param name="snp_chrom_col" label="SNP chromosome column" type="data_column" data_ref="input1" />
+    <param name="snp_start_col" label="SNP start column" type="data_column" data_ref="input1" />
+    <param name="snp_end_col" label="SNP end column" type="data_column" data_ref="input1" />
+    <param name="snp_strand_col" label="SNP strand column" type="data_column" data_ref="input1" />
+    <param name="snp_observed_col" label="SNP observed column" type="data_column" data_ref="input1" />
+  </inputs>
+  <outputs>
+    <data name="output1" format="interval" metadata_source="input1"/>
+  </outputs>
+   <tests>
+     <test>
+       <param name="input1" value="mutate_snp_codon_in.interval"/>
+       <param name="codon_seq_col" value="8"/>
+       <param name="snp_chrom_col" value="17"/>
+       <param name="snp_start_col" value="18"/>
+       <param name="snp_end_col" value="19"/>
+       <param name="snp_strand_col" value="22"/>
+       <param name="snp_observed_col" value="25"/>
+       <output name="output1" file="mutate_snp_codon_out.interval" />
+     </test>
+   </tests>
+  <help>
+This tool takes an interval file as input.  This input should contain a set of codon locations and corresponding DNA sequence (such as from the *Extract Genomic DNA* tool) joined to SNP locations with observed values (such as *all fields from selected table* from the snp130 table of hg18 at the UCSC Table browser).  This interval file should have the metadata (chromosome, start, end, strand) set for the columns containing the locations of the codons. The user needs to specify the columns containing the sequence for the codon as well as the genomic positions and observed values (values should be split by '/') for the SNP data as tool input; SNPs positions and sequence substitutes must have a length of exactly 1. Only genomic intervals which yield a different sequence string are output. All sequence characters are converted to uppercase during processing.
+
+  For example, using these settings:
+
+  * **metadata** **chromosome**, **start**, **end** and **strand** set to **1**, **2**, **3** and **6**, respectively
+  * **Codon Sequence column** set to **c8**
+  * **SNP chromosome column** set to **c17**
+  * **SNP start column** set to **c18**
+  * **SNP end column** set to **c19**
+  * **SNP strand column** set to **c22**
+  * **SNP observed column** set to **c25**
+
+  with the following input::
+
+    chr1	58995	58998	NM_001005484	0	+	GAA	GAA	Glu	GAA	1177632	28.96	0	2787607	0.422452662804	585	chr1	58996	58997	rs1638318	0	+	A	A	A/G	genomic	single	by-submitter	0	0	unknown	exact	3
+    chr1	59289	59292	NM_001005484	0	+	TTT	TTT	Phe	TTT	714298	17.57	0	1538990	0.464134269878	585	chr1	59290	59291	rs71245814	0	+	T	T	G/T	genomic	single	unknown	0	0	unknown	exact	3
+    chr1	59313	59316	NM_001005484	0	+	AAG	AAG	Lys	AAG	1295568	31.86	0	2289189	0.565950648898	585	chr1	59315	59316	rs2854682	0	-	G	G	C/T	genomic	single	by-submitter	0	0	unknown	exact	3
+    chr1	59373	59376	NM_001005484	0	+	ACA	ACA	Thr	ACA	614523	15.11	0	2162384	0.284187729839	585	chr1	59373	59374	rs2691305	0	-	A	A	C/T	genomic	single	unknown	0	0	unknown	exact	3
+    chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs2531266	0	+	G	G	C/G	genomic	single	by-submitter	0	0	unknown	exact	3
+    chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs55874132	0	+	G	G	C/G	genomic	single	unknown	0	0	coding-synon	exact	1
+
+
+  will produce::
+
+    chr1	58995	58998	NM_001005484	0	+	GAA	GAA	Glu	GAA	1177632	28.96	0	2787607	0.422452662804	585	chr1	58996	58997	rs1638318	0	+	A	A	A/G	genomic	single	by-submitter	0	0	unknown	exact	3	GGA
+    chr1	59289	59292	NM_001005484	0	+	TTT	TTT	Phe	TTT	714298	17.57	0	1538990	0.464134269878	585	chr1	59290	59291	rs71245814	0	+	T	T	G/T	genomic	single	unknown	0	0	unknown	exact	3	TGT
+    chr1	59313	59316	NM_001005484	0	+	AAG	AAG	Lys	AAG	1295568	31.86	0	2289189	0.565950648898	585	chr1	59315	59316	rs2854682	0	-	G	G	C/T	genomic	single	by-submitter	0	0	unknown	exact	3	AAA
+    chr1	59373	59376	NM_001005484	0	+	ACA	ACA	Thr	ACA	614523	15.11	0	2162384	0.284187729839	585	chr1	59373	59374	rs2691305	0	-	A	A	C/T	genomic	single	unknown	0	0	unknown	exact	3	GCA
+    chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs2531266	0	+	G	G	C/G	genomic	single	by-submitter	0	0	unknown	exact	3	GCC
+    chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs55874132	0	+	G	G	C/G	genomic	single	unknown	0	0	coding-synon	exact	1	GCC
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21775304&gt;`_
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mutate_snp_codon_in.interval	Wed Sep 25 11:27:51 2013 -0400
@@ -0,0 +1,6 @@
+chr1	58995	58998	NM_001005484	0	+	GAA	GAA	Glu	GAA	1177632	28.96	0	2787607	0.422452662804	585	chr1	58996	58997	rs1638318	0	+	A	A	A/G	genomic	single	by-submitter	0	0	unknown	exact	3
+chr1	59289	59292	NM_001005484	0	+	TTT	TTT	Phe	TTT	714298	17.57	0	1538990	0.464134269878	585	chr1	59290	59291	rs71245814	0	+	T	T	G/T	genomic	single	unknown	0	0	unknown	exact	3
+chr1	59313	59316	NM_001005484	0	+	AAG	AAG	Lys	AAG	1295568	31.86	0	2289189	0.565950648898	585	chr1	59315	59316	rs2854682	0	-	G	G	C/T	genomic	single	by-submitter	0	0	unknown	exact	3
+chr1	59373	59376	NM_001005484	0	+	ACA	ACA	Thr	ACA	614523	15.11	0	2162384	0.284187729839	585	chr1	59373	59374	rs2691305	0	-	A	A	C/T	genomic	single	unknown	0	0	unknown	exact	3
+chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs2531266	0	+	G	G	C/G	genomic	single	by-submitter	0	0	unknown	exact	3
+chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs55874132	0	+	G	G	C/G	genomic	single	unknown	0	0	coding-synon	exact	1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mutate_snp_codon_out.interval	Wed Sep 25 11:27:51 2013 -0400
@@ -0,0 +1,6 @@
+chr1	58995	58998	NM_001005484	0	+	GAA	GAA	Glu	GAA	1177632	28.96	0	2787607	0.422452662804	585	chr1	58996	58997	rs1638318	0	+	A	A	A/G	genomic	single	by-submitter	0	0	unknown	exact	3	GGA
+chr1	59289	59292	NM_001005484	0	+	TTT	TTT	Phe	TTT	714298	17.57	0	1538990	0.464134269878	585	chr1	59290	59291	rs71245814	0	+	T	T	G/T	genomic	single	unknown	0	0	unknown	exact	3	TGT
+chr1	59313	59316	NM_001005484	0	+	AAG	AAG	Lys	AAG	1295568	31.86	0	2289189	0.565950648898	585	chr1	59315	59316	rs2854682	0	-	G	G	C/T	genomic	single	by-submitter	0	0	unknown	exact	3	AAA
+chr1	59373	59376	NM_001005484	0	+	ACA	ACA	Thr	ACA	614523	15.11	0	2162384	0.284187729839	585	chr1	59373	59374	rs2691305	0	-	A	A	C/T	genomic	single	unknown	0	0	unknown	exact	3	GCA
+chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs2531266	0	+	G	G	C/G	genomic	single	by-submitter	0	0	unknown	exact	3	GCC
+chr1	59412	59415	NM_001005484	0	+	GCG	GCG	Ala	GCG	299495	7.37	0	2820741	0.106176001271	585	chr1	59414	59415	rs55874132	0	+	G	G	C/G	genomic	single	unknown	0	0	coding-synon	exact	1	GCC