Mercurial > repos > devteam > mapping_to_ucsc
changeset 0:601abbd22cea draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 12:33:55 -0400 |
parents | |
children | 5385aceef9e9 |
files | mapping_to_ucsc.py mapping_to_ucsc.xml |
diffstat | 2 files changed, 405 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mapping_to_ucsc.py Mon May 19 12:33:55 2014 -0400 @@ -0,0 +1,203 @@ +#!/usr/bin/env python + +import sys, tempfile, os + +assert sys.version_info[:2] >= (2.4) + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +def main(): + + out_fname = sys.argv[1] + in_fname = sys.argv[2] + chr_col = int(sys.argv[3])-1 + coord_col = int(sys.argv[4])-1 + track_type = sys.argv[5] + if track_type == 'coverage' or track_type == 'both': + coverage_col = int(sys.argv[6])-1 + cname = sys.argv[7] + cdescription = sys.argv[8] + ccolor = sys.argv[9].replace('-',',') + cvisibility = sys.argv[10] + if track_type == 'snp' or track_type == 'both': + if track_type == 'both': + j = 5 + else: + j = 0 + #sname = sys.argv[7+j] + sdescription = sys.argv[6+j] + svisibility = sys.argv[7+j] + #ref_col = int(sys.argv[10+j])-1 + read_col = int(sys.argv[8+j])-1 + + + # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically) + sorted_infile = tempfile.NamedTemporaryFile() + try: + os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname)) + except Exception, exc: + stop_err( 'Initialization error -> %s' %str(exc) ) + + #generate chr list + sorted_infile.seek(0) + chr_vals = [] + for line in file( sorted_infile.name ): + line = line.strip() + if not(line): + continue + try: + fields = line.split('\t') + chr = fields[chr_col] + if chr not in chr_vals: + chr_vals.append(chr) + except: + pass + if not(chr_vals): + stop_err("Skipped all lines as invalid.") + + if track_type == 'coverage' or track_type == 'both': + if track_type == 'coverage': + fout = open( out_fname, "w" ) + else: + fout = tempfile.NamedTemporaryFile() + fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( cname, cdescription, ccolor, cvisibility )) + if track_type == 'snp' or track_type == 'both': + fout_a = tempfile.NamedTemporaryFile() + fout_t = tempfile.NamedTemporaryFile() + fout_g = tempfile.NamedTemporaryFile() + fout_c = tempfile.NamedTemporaryFile() + fout_ref = tempfile.NamedTemporaryFile() + + fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track A", sdescription, '255,0,0', svisibility )) + fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track T", sdescription, '0,255,0', svisibility )) + fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track G", sdescription, '0,0,255', svisibility )) + fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ + % ( "Track C", sdescription, '255,0,255', svisibility )) + + + sorted_infile.seek(0) + for line in file( sorted_infile.name ): + line = line.strip() + if not(line): + continue + try: + fields = line.split('\t') + chr = fields[chr_col] + start = int(fields[coord_col]) + assert start > 0 + except: + continue + try: + ind = chr_vals.index(chr) #encountered chr for the 1st time + del chr_vals[ind] + prev_start = '' + header = "variableStep chrom=%s\n" %(chr) + if track_type == 'coverage' or track_type == 'both': + coverage = int(fields[coverage_col]) + line1 = "%s\t%s\n" %(start,coverage) + fout.write("%s%s" %(header,line1)) + if track_type == 'snp' or track_type == 'both': + a = t = g = c = 0 + fout_a.write("%s" %(header)) + fout_t.write("%s" %(header)) + fout_g.write("%s" %(header)) + fout_c.write("%s" %(header)) + try: + #ref_nt = fields[ref_col].capitalize() + read_nt = fields[read_col].capitalize() + try: + nt_ind = ['A','T','G','C'].index(read_nt) + if nt_ind == 0: + a+=1 + elif nt_ind == 1: + t+=1 + elif nt_ind == 2: + g+=1 + else: + c+=1 + except ValueError: + pass + except: + pass + prev_start = start + except ValueError: + if start != prev_start: + if track_type == 'coverage' or track_type == 'both': + coverage = int(fields[coverage_col]) + fout.write("%s\t%s\n" %(start,coverage)) + if track_type == 'snp' or track_type == 'both': + if a: + fout_a.write("%s\t%s\n" %(prev_start,a)) + if t: + fout_t.write("%s\t%s\n" %(prev_start,t)) + if g: + fout_g.write("%s\t%s\n" %(prev_start,g)) + if c: + fout_c.write("%s\t%s\n" %(prev_start,c)) + a = t = g = c = 0 + try: + #ref_nt = fields[ref_col].capitalize() + read_nt = fields[read_col].capitalize() + try: + nt_ind = ['A','T','G','C'].index(read_nt) + if nt_ind == 0: + a+=1 + elif nt_ind == 1: + t+=1 + elif nt_ind == 2: + g+=1 + else: + c+=1 + except ValueError: + pass + except: + pass + prev_start = start + else: + if track_type == 'snp' or track_type == 'both': + try: + #ref_nt = fields[ref_col].capitalize() + read_nt = fields[read_col].capitalize() + try: + nt_ind = ['A','T','G','C'].index(read_nt) + if nt_ind == 0: + a+=1 + elif nt_ind == 1: + t+=1 + elif nt_ind == 2: + g+=1 + else: + c+=1 + except ValueError: + pass + except: + pass + + if track_type == 'snp' or track_type == 'both': + if a: + fout_a.write("%s\t%s\n" %(prev_start,a)) + if t: + fout_t.write("%s\t%s\n" %(prev_start,t)) + if g: + fout_g.write("%s\t%s\n" %(prev_start,g)) + if c: + fout_c.write("%s\t%s\n" %(prev_start,c)) + + fout_a.seek(0) + fout_g.seek(0) + fout_t.seek(0) + fout_c.seek(0) + + if track_type == 'snp': + os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) + elif track_type == 'both': + fout.seek(0) + os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) +if __name__ == "__main__": + main() \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mapping_to_ucsc.xml Mon May 19 12:33:55 2014 -0400 @@ -0,0 +1,202 @@ +<tool id="mapToUCSC" name="Format mapping data" version="1.0.0"> + <description> as UCSC custom track</description> + <command interpreter="python"> + mapping_to_ucsc.py + $out_file1 + $input + $chr_col + $coord_col + $track.track_type + #if $track.track_type == "coverage" or $track.track_type == "both" + $track.coverage_col + "${track.cname}" + "${track.cdescription}" + "${track.ccolor}" + "${track.cvisibility}" + #end if + #if $track.track_type == "snp" or $track.track_type == "both" + "${track.sdescription}" + "${track.svisibility}" + $track.col2 + #end if + </command> + <inputs> + <param format="tabular" name="input" type="data" label="Select mapping data"/> + <param name="chr_col" type="data_column" data_ref="input" label="Column for reference chromosome" /> + <param name="coord_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for reference co-ordinate" /> + <conditional name="track"> + <param name="track_type" type="select" label="Display"> + <option value="snp" selected="true">SNPs</option> + <option value="coverage">Read coverage</option> + <option value="both">Both</option> + </param> + <when value = "coverage"> + <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" /> + <param name="cname" type="text" size="15" value="User Track" label="Coverage track name"> + <validator type="length" max="15"/> + </param> + <param name="cdescription" type="text" value="User Supplied Coverage Track (from Galaxy)" label="Coverage track description"> + <validator type="length" max="60" size="15"/> + </param> + <param label="Coverage track Color" name="ccolor" type="select"> + <option selected="yes" value="0-0-0">Black</option> + <option value="255-0-0">Red</option> + <option value="0-255-0">Green</option> + <option value="0-0-255">Blue</option> + <option value="255-0-255">Magenta</option> + <option value="0-255-255">Cyan</option> + <option value="255-215-0">Gold</option> + <option value="160-32-240">Purple</option> + <option value="255-140-0">Orange</option> + <option value="255-20-147">Pink</option> + <option value="92-51-23">Dark Chocolate</option> + <option value="85-107-47">Olive green</option> + </param> + <param label="Coverage track Visibility" name="cvisibility" type="select"> + <option selected="yes" value="1">Dense</option> + <option value="2">Full</option> + <option value="3">Pack</option> + <option value="4">Squish</option> + <option value="0">Hide</option> + </param> + </when> + + <when value = "snp"> + <!-- + <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" /> + --> + <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" /> + <!-- + <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name"> + <validator type="length" max="15"/> + </param> + --> + <param name="sdescription" type="text" value="User Supplied Track (from Galaxy)" label="SNP track description"> + <validator type="length" max="60" size="15"/> + </param> + <param label="SNP track Visibility" name="svisibility" type="select"> + <option selected="yes" value="1">Dense</option> + <option value="2">Full</option> + <option value="3">Pack</option> + <option value="4">Squish</option> + <option value="0">Hide</option> + </param> + </when> + + <when value = "both"> + <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" /> + <param name="cname" type="text" size="15" value="User Track" label="Coverage track name"> + <validator type="length" max="15"/> + </param> + <param name="cdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="Coverage track description"> + <validator type="length" max="60"/> + </param> + <param label="Coverage track Color" name="ccolor" type="select"> + <option selected="yes" value="0-0-0">Black</option> + <option value="255-0-0">Red</option> + <option value="0-255-0">Green</option> + <option value="0-0-255">Blue</option> + <option value="255-0-255">Magenta</option> + <option value="0-255-255">Cyan</option> + <option value="255-215-0">Gold</option> + <option value="160-32-240">Purple</option> + <option value="255-140-0">Orange</option> + <option value="255-20-147">Pink</option> + <option value="92-51-23">Dark Chocolate</option> + <option value="85-107-47">Olive green</option> + </param> + <param label="Coverage track Visibility" name="cvisibility" type="select"> + <option selected="yes" value="1">Dense</option> + <option value="2">Full</option> + <option value="3">Pack</option> + <option value="4">Squish</option> + <option value="0">Hide</option> + </param> + <!-- + <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" /> + --> + <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" /> + <!-- + <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name"> + <validator type="length" max="15"/> + </param> + --> + <param name="sdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="SNP track description"> + <validator type="length" max="60"/> + </param> + <param label="SNP track Visibility" name="svisibility" type="select"> + <option selected="yes" value="1">Dense</option> + <option value="2">Full</option> + <option value="3">Pack</option> + <option value="4">Squish</option> + <option value="0">Hide</option> + </param> + </when> + </conditional> + </inputs> + <outputs> + <data format="customtrack" name="out_file1"/> + </outputs> + + + <help> + +.. class:: infomark + +**What it does** + +This tool turns mapping data generated by short read mappers into a format that can be displayed in the UCSC genome browser as a custom track. + +----- + +.. class:: warningmark + +**Note** + +This tool requires the mapping data to contain at least the following information: + +chromosome, genome coordinate, read nucleotide (if option to display is SNPs), read coverage (if option to display is Read coverage). + +----- + +**Example** + +For the following Mapping data:: + + #chr g_start read_id read_coord g_nt read_nt qual read_coverage + chrM 1 1:29:1672:1127/1 11 G G 40 134 + chrM 1 1:32:93:933/1 4 G A 40 134 + chrM 1 1:34:116:2032/1 11 G A 40 134 + chrM 1 1:39:207:964/1 1 G G 40 134 + chrM 2 1:3:359:848/1 1 G C 40 234 + chrM 2 1:40:1435:1013/1 1 G G 40 234 + chrM 3 1:40:730:972/1 9 G G 40 334 + chrM 4 1:42:1712:921/2 31 G T 35 434 + chrM 4 1:44:1649:493/1 4 G G 40 434 + +running this tool to display both SNPs and Read coverage will return the following tracks, containing aggregated data per genome co-ordinate:: + + track type=wiggle_0 name="Coverage Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1 + variableStep chrom=chrM + 1 134 + 2 234 + 3 334 + 4 434 + track type=wiggle_0 name="Track A" description="User Supplied SNP Track (from Galaxy)" color=255,0,0 visibility=1 + variableStep chrom=chrM + 1 2 + track type=wiggle_0 name="Track T" description="User Supplied SNP Track (from Galaxy)" color=0,255,0 visibility=1 + variableStep chrom=chrM + 4 1 + track type=wiggle_0 name="Track G" description="User Supplied SNP Track (from Galaxy)" color=0,0,255 visibility=1 + variableStep chrom=chrM + 1 2 + 2 1 + 3 1 + 4 1 + track type=wiggle_0 name="Track C" description="User Supplied SNP Track (from Galaxy)" color=255,0,255 visibility=1 + variableStep chrom=chrM + 2 1 + + </help> +</tool>