view cravat_submit.py @ 2:676c8be98be4 draft

Uploaded
author rsajulga
date Thu, 17 May 2018 22:32:25 -0400
parents 7ebdd4ac13a2
children
line wrap: on
line source

import requests
import json
import time
import urllib
import sys
import csv
import re
import math
from difflib import SequenceMatcher
from xml.etree import ElementTree as ET
import sqlite3

try:
    input_filename = sys.argv[1]
    input_select_bar = sys.argv[2]
    GRCh_build = sys.argv[3]
    probed_filename = sys.argv[4]
    output_filename = sys.argv[5]
    file_3 = sys.argv[6]
    file_4 = sys.argv[7]
    file_5 = sys.argv[8]
except:
    # Filenames for testing.
    input_filename = 'input/Galaxy68-[VCF-BEDintersect__on_data_65_and_data_6].vcf'
    input_select_bar = 'VEST'
    GRCh_build = 'GRCh38'
    output_filename = 'combined_variants.tsv'
    probed_filename = 'input/Galaxy66-[PepPointer].bed'
    file_3 = 'output/Gene_Level_Analysis.tsv'
    file_4 = 'output/Variant_Non-coding.Result.tsv'
    file_5 = 'output/Input_Errors.Result.tsv'
    matches_filename = 'matches.tsv'

def getSequence(transcript_id):
    server = 'http://rest.ensembl.org'
    ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein'
    req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"})
    
    if not req.ok:
        return None
    
    root = ET.fromstring(req.content)
    for child in root.iter('AAseq'):
        return child.text


write_header = True

GRCh37hg19 = 'off'
if GRCh_build == 'GRCh37':
    GRCh37hg19 = 'on'

#plugs in params to given URL
submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19})

#Makes the data a json dictionary, takes out only the job ID
jobid = json.loads(submit.text)['jobid']

#out_file.write(jobid)    
submitted = json.loads(submit.text)['status']
#out_file.write('\t' + submitted)

input_file = open(input_filename)

# Loads the proBED file as a list. 
if (probed_filename != 'None'):
    proBED = []
    with open(probed_filename) as tsvin:
        tsvreader = csv.reader(tsvin, delimiter='\t')
        for i, row in enumerate(tsvreader):
            proBED.append(row)
  
#loops until we find a status equal to Success, then breaks
while True:
    check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
    status = json.loads(check.text)['status']
    resultfileurl = json.loads(check.text)['resultfileurl']
    #out_file.write(str(status) + ', ')
    if status == 'Success':
        #out_file.write('\t' + resultfileurl)
        break
    else:
        time.sleep(2)
        
#out_file.write('\n')

#creates three files
file_1 = 'Variant_Result.tsv'
file_2 = 'Additional_Details.tsv'
#file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv'

#Downloads the tabular results
urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1)
urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2)
urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3)
urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4)
urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5)

#opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer
with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
    tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')        
    tsvout = csv.writer(tsvout, delimiter='\t')

    headers = []
    duplicate_indices = []
    n = 12 #Index for proteogenomic column start
    reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)')
    SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
    pep_muts = {}
    pep_map = {}
    rows = []

    for row in tsvreader_2:
        if row != [] and row[0][0] != '#':
        #checks if the row begins with input line
            if row[0] == 'Input line':
                vad_headers = row
            else:
                # Initially screens through the output Variant Additional Details to catch mutations on same peptide region
                genchrom = row[vad_headers.index('Chromosome')]
                genpos = int(row[vad_headers.index('Position')])
                aa_change = row[vad_headers.index('Protein sequence change')]
                input_line = row[vad_headers.index('Input line')]
                
                for peptide in proBED:
                    pepseq = peptide[3]
                    pepchrom = peptide[0]
                    pepposA = int(peptide[1])
                    pepposB = int(peptide[2])
                    if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB:
                        strand = row[vad_headers.index('Strand')]
                        transcript_strand = row[vad_headers.index('S.O. transcript strand')]

                        # Calculates the position of the variant amino acid(s) on peptide
                        if transcript_strand == strand:                               
                            aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
                        if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq):
                            aa_peppos = int(math.floor((pepposB - genpos)/3.0))
                        if pepseq in pep_muts:
                            if aa_change not in pep_muts[pepseq]:
                                pep_muts[pepseq][aa_change] = [aa_peppos]
                            else:
                                if aa_peppos not in pep_muts[pepseq][aa_change]:
                                    pep_muts[pepseq][aa_change].append(aa_peppos)
                        else:
                            pep_muts[pepseq] = {aa_change : [aa_peppos]}
                        # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence.
                        if input_line in pep_map:
                            if pepseq not in pep_map[input_line]:
                                pep_map[input_line].append(pepseq)
                        else:
                            pep_map[input_line] = [pepseq]

with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
    tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
    tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
    tsvout = csv.writer(tsvout, delimiter='\t')

    headers = []
            
    #loops through each row in the Variant Additional Details (VAD) file
    for row in tsvreader_2:
        
        #sets row_2 equal to the same row in Variant Result (VR) file
        row_2 = tsvreader_1.next()
        #checks if row is empty or if the first term contains '#'
        if row == [] or row[0][0] == '#':
            tsvout.writerow(row)
        else:
            if row[0] == 'Input line': 
                #Goes through each value in the headers list in VAD
                for value in row:   
                    #Adds each value into headers 
                    headers.append(value)
                #Loops through the Keys in VR
                for i,value in enumerate(row_2):
                    #Checks if the value is already in headers
                    if value in headers:
                        duplicate_indices.append(i)
                        continue
                    #else adds the header to headers
                    else:
                        headers.append(value)
                #Adds appropriate headers when proteomic input is supplied
                if (probed_filename != 'None'):
                    headers.insert(n, 'Variant peptide')
                    headers.insert(n, 'Reference peptide')
                tsvout.writerow(headers)
            else:                        
                cells = []
                #Goes through each value in the next list
                for value in row:
                    #adds it to cells
                    cells.append(value)
                #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file)
                for i,value in enumerate(row_2):
                    #adds in the rest of the values to cells
                    if i not in duplicate_indices:
                        # Skips the initial 11 columns and the VEST p-value (already in VR file)
                        cells.append(value)

                # Verifies the peptides intersected previously through sequences obtained from Ensembl's API
                if (probed_filename != 'None'):
                    cells.insert(n,'')
                    cells.insert(n,'')
                    input_line = cells[headers.index('Input line')]
                    if input_line in pep_map:
                        pepseq = pep_map[input_line][0]
                        aa_changes = pep_muts[pepseq]
                        transcript_id = cells[headers.index('S.O. transcript')]
                        ref_fullseq = getSequence(transcript_id)
                        # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available
                        if not ref_fullseq:
                            transcripts = cells[headers.index('S.O. all transcripts')]
                            for transcript in transcripts.split(','):
                                if transcript:
                                    mat = SOtranscripts.search(transcript)
                                    ref_fullseq = getSequence(mat.group(1))
                                    if ref_fullseq:
                                        aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
                                        break
                        # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence
                        if not ref_fullseq:
                            transcripts = cells[headers.index('S.O. all transcripts')]
                            for transcript in transcripts.split(','):
                                if transcript:
                                    mat = SOtranscripts.search(transcript)
                                    ref_fullseq = getSequence(mat.group(1).split('.')[0])
                                    if ref_fullseq:
                                        aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
                                        break
                        if ref_fullseq:
                            # Sorts the amino acid changes
                            positions = {}
                            for aa_change in aa_changes:
                                m = reg_seq_change.search(aa_change)
                                aa_protpos = int(m.group(2))
                                aa_peppos = aa_changes[aa_change][0]
                                aa_startpos = aa_protpos - aa_peppos - 1
                                if aa_startpos in positions:
                                    positions[aa_startpos].append(aa_change)
                                else:
                                    positions[aa_startpos] = [aa_change]
                            # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference)
                            for pep_protpos in positions:
                                ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
                                muts = positions[pep_protpos]
                                options = []
                                mut_seq = ref_seq
                                for mut in muts:
                                    m = reg_seq_change.search(mut)
                                    ref_aa = m.group(1)
                                    mut_pos = int(m.group(2))
                                    alt_aa = m.group(3)
                                    pep_mutpos = mut_pos - pep_protpos - 1
                                    if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa):
                                        if pepseq[pep_mutpos] == ref_aa:
                                            mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:]
                                        else:
                                            mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:]
                                    else:
                                        break
                                # Adds the mutated peptide and reference peptide if mutated correctly
                                if pepseq == mut_seq:
                                    cells[n+1] = pepseq
                                    cells[n] = ref_seq
                #print  cells
                tsvout.writerow(cells)



            
    

#a = 'col1\tcol2\tcol3'
#header_list = a.split('\t')

#loop through the two results, when you first hit header you print out the headers in tabular form
#Print out each header only once
#Combine both headers into one output file
#loop through the rest of the data and assign each value to its assigned header
#combine this all into one output file