view alignment/phytab_mview.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line source

#!/usr/bin/env python
## usage: ./phytab_mview.py <phytabinput> <dna|protein> <output_html> <extra_files_path>
## Splits an aligned phytab file containing multiple genes into one FASTA
## file per gene, then runs mview on each of them.

import sys, os, os.path, tempfile, shutil, re, shlex, subprocess
import optparse
from multiprocessing import Pool

#define some variables to call later:

# Not referenced anywhere in the visible code.
directory = ""
# Suffix appended to a gene family name to form its per-gene FASTA file name.
extension = ".fs"
# Opening part of the main HTML index page: doctype, head, page heading and
# the start of the table of links. The table is left open here; the rows
# linking to the individual mview HTML files are written after this text.
html_header = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
<TITLE></TITLE>
</HEAD>
<BODY BGCOLOR='white' TEXT='black' LINK='blue' ALINK='red' VLINK='purple'>
<H1>PHYTAB MVIEW ALIGNMENT VIEWER</H1>
<PRE>Select from below to view aligned sequence as HTML (left) or FASTA (right) in browser.
</PRE>
<table border="1" bordercolor="#000000" style="background-color:#FFFFFF" width="300" cellpadding="3" cellspacing="0">
	<tr>
		<td>mview HTML</td>
		<!--<td>FASTA</td>-->
	</tr>"""
# Closing part of the main HTML index page: the MView credit link followed by
# the closing BODY and HTML tags. It does not contain a closing table tag.
html_close =  """
<P><SMALL><A HREF="http://bio-mview.sourceforge.net">MView</A> </SMALL><BR>
</BODY>
</HTML>"""	

#define some functions to call in 'main':
#    first, sanitize problematic characters
def unescape(string):
  """Return `string` with escape tokens turned back into their characters.

  Tokens such as '__gt__' or '__sq__' are replaced by the literal character
  they stand for ('>' and "'" respectively). Presumably these tokens are
  produced upstream when the command line is sanitized -- confirm against
  the wrapper that calls this script.

  string -- text that may contain escape tokens.
  Returns the text with every known token replaced; text without tokens is
  returned unchanged.
  """
  # A tuple of pairs (rather than a dict iterated with the Python 2-only
  # iteritems()) keeps the replacement order fixed and works unchanged on
  # both Python 2 and Python 3.
  mapped_chars = (
        ('>', '__gt__'),
        ('<', '__lt__'),
        ("'", '__sq__'),
        ('"', '__dq__'),
        ('[', '__ob__'),
        (']', '__cb__'),
        ('{', '__oc__'),
        ('}', '__cc__'),
        ('@', '__at__'),
        ('\n', '__cn__'),
        ('\r', '__cr__'),
        ('\t', '__tc__'),
        ('#', '__pd__'),
        )

  for char, token in mapped_chars:
    string = string.replace(token, char)

  return string
#  next, define tabular --> fasta conversion
class Sequence:
  """One record of a phytab file, split on whitespace.

  The first three fields are kept as species, family and name; the last
  field is the sequence, and every field before the last one is joined
  with single spaces to form the FASTA header. The unmodified input line
  is kept in `string`.
  """

  def __init__(self, string):
    fields = string.split()
    self.string = string
    # The sequence is always the final field; everything before it is header.
    self.sequence = fields[-1]
    self.header = ' '.join(fields[:-1])
    self.species, self.family, self.name = fields[0], fields[1], fields[2]

  def printFASTA(self):
    """Return the record as a two-line FASTA entry ('> header', sequence)."""
    return '> {0}\n{1}\n'.format(self.header, self.sequence)

#  then define function to apply preceding conversion method to all genes
#  (creates separate file for each gene)
def saveMulti(tabFile):
  """Split a phytab file into one FASTA file per gene family.

  Each non-blank line of `tabFile` is parsed as a Sequence and its FASTA
  form is appended to '<family><extension>' in the current working
  directory, so records of the same family end up in the same file in
  input order.

  tabFile -- path of the phytab file to read.
  """
  with open(tabFile) as f:
    for line in f:
      # Blank lines (e.g. a trailing empty line) carry no record; parsing
      # them would fail with IndexError, so skip them.
      if not line.strip():
        continue
      seq = Sequence(line)
      with open(seq.family + extension, "a") as p:
        p.write(seq.printFASTA())
                
#subroutine to write main HTML output containing valid urls to mview htmls
def resultsto_output_html(html_mainoutput,basepath):
  """Write the main HTML index page linking to the mview HTML files.

  html_mainoutput -- path of the index page to create (overwritten).
  basepath        -- directory that holds the per-gene '.html' files.

  The page consists of html_header, one table row per '.html' file found
  in `basepath` (sorted by name, linked by bare file name), the closing
  table tag, and html_close.
  """
  # Match on the extension rather than on any 'html' substring so unrelated
  # files in the directory are not linked.
  htmllist = sorted(f for f in os.listdir(basepath) if f.endswith('.html'))
  # 'with' guarantees the index file is closed even if a write fails.
  with open(html_mainoutput, 'w') as html:
    html.write(html_header)
    for f in htmllist:
      html.write('<tr><td><a href="' + f + '">' + f + '</a></td></tr>\n')
    # html_header opens the table and html_close does not close it.
    html.write('</table>')
    html.write(html_close)

def main():
  """Split a phytab alignment by gene and render each gene with mview.

  Positional command-line arguments (as passed by the Galaxy wrapper):
    1. $input   -- path of the aligned phytab file
    2. $dna     -- 'dna' adds -DNA to the mview call; any other value
                   runs mview without it
    3. $output  -- path of the main HTML index page
    4. "$output.extra_files_path" -- directory where the per-gene HTML
                   files are saved
  """
  inputphytabfile = sys.argv[1]
  dnaorprotein = sys.argv[2]
  output = sys.argv[3]
  extra_files_path = sys.argv[4]

  inputFile = unescape(inputphytabfile)
  # Make the per-gene fasta files in the current working directory.
  saveMulti(inputFile)

  # Make the directory that will hold the mview htmls, if it is missing.
  if not os.path.isdir(extra_files_path):
    os.makedirs(extra_files_path)

  # Build the mview command once. The comparison must use '==': 'is' tests
  # object identity, which is false for a string taken from sys.argv, so
  # the -DNA flag would never be passed.
  mview_cmd = ['mview', '-in', 'pearson']
  if dnaorprotein == 'dna':
    mview_cmd.append('-DNA')
  mview_cmd.extend(['-bold', '-coloring', 'group', '-html', 'head'])

  # Only the files written by saveMulti (named '<family><extension>') are
  # alignments; a plain substring test would also pick up unrelated files.
  sortedfileorder = sorted(
      f for f in os.listdir(os.getcwd()) if f.endswith(extension))

  # Execute mview on each fasta, storing the result in extra_files_path
  # as <gene_aln>.html.
  for gene_aln in sortedfileorder:
    result_path = os.path.join(extra_files_path, gene_aln + '.html')
    cmd = subprocess.Popen(mview_cmd + [gene_aln], stdout=subprocess.PIPE)
    # communicate() both drains stdout and waits for the process. Calling
    # wait() first can deadlock once the output fills the pipe buffer.
    out = cmd.communicate()[0]

    with open(result_path, 'wb') as fileout:
      fileout.write(out)

  # Write the main html output linking to every per-gene html.
  resultsto_output_html(output, extra_files_path)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()