Mercurial > repos > rsajulga > proteomic_cravat_score_and_annotate
view cravat_submit.py @ 2:676c8be98be4 draft
Uploaded
author | rsajulga |
---|---|
date | Thu, 17 May 2018 22:32:25 -0400 |
parents | 7ebdd4ac13a2 |
children |
line wrap: on
line source
import requests import json import time import urllib import sys import csv import re import math from difflib import SequenceMatcher from xml.etree import ElementTree as ET import sqlite3 try: input_filename = sys.argv[1] input_select_bar = sys.argv[2] GRCh_build = sys.argv[3] probed_filename = sys.argv[4] output_filename = sys.argv[5] file_3 = sys.argv[6] file_4 = sys.argv[7] file_5 = sys.argv[8] except: # Filenames for testing. input_filename = 'input/Galaxy68-[VCF-BEDintersect__on_data_65_and_data_6].vcf' input_select_bar = 'VEST' GRCh_build = 'GRCh38' output_filename = 'combined_variants.tsv' probed_filename = 'input/Galaxy66-[PepPointer].bed' file_3 = 'output/Gene_Level_Analysis.tsv' file_4 = 'output/Variant_Non-coding.Result.tsv' file_5 = 'output/Input_Errors.Result.tsv' matches_filename = 'matches.tsv' def getSequence(transcript_id): server = 'http://rest.ensembl.org' ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein' req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"}) if not req.ok: return None root = ET.fromstring(req.content) for child in root.iter('AAseq'): return child.text write_header = True GRCh37hg19 = 'off' if GRCh_build == 'GRCh37': GRCh37hg19 = 'on' #plugs in params to given URL submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19}) #Makes the data a json dictionary, takes out only the job ID jobid = json.loads(submit.text)['jobid'] #out_file.write(jobid) submitted = json.loads(submit.text)['status'] #out_file.write('\t' + submitted) input_file = open(input_filename) # Loads the proBED file as a list. if (probed_filename != 'None'): proBED = [] with open(probed_filename) as tsvin: tsvreader = csv.reader(tsvin, delimiter='\t') for i, row in enumerate(tsvreader): proBED.append(row) #loops until we find a status equal to Success, then breaks while True: check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid}) status = json.loads(check.text)['status'] resultfileurl = json.loads(check.text)['resultfileurl'] #out_file.write(str(status) + ', ') if status == 'Success': #out_file.write('\t' + resultfileurl) break else: time.sleep(2) #out_file.write('\n') #creates three files file_1 = 'Variant_Result.tsv' file_2 = 'Additional_Details.tsv' #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv' #Downloads the tabular results urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1) urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2) urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3) urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4) urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5) #opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') tsvout = csv.writer(tsvout, delimiter='\t') headers = [] duplicate_indices = [] n = 12 #Index for proteogenomic column start reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)') SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)') pep_muts = {} pep_map = {} rows = [] for row in tsvreader_2: if row != [] and row[0][0] != '#': #checks if the row begins with input line if row[0] == 'Input line': vad_headers = row else: # Initially screens through the output Variant Additional Details to catch mutations on same peptide region genchrom = row[vad_headers.index('Chromosome')] genpos = int(row[vad_headers.index('Position')]) aa_change = row[vad_headers.index('Protein sequence change')] input_line = row[vad_headers.index('Input line')] for peptide in proBED: pepseq = peptide[3] pepchrom = peptide[0] pepposA = int(peptide[1]) pepposB = int(peptide[2]) if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB: strand = row[vad_headers.index('Strand')] transcript_strand = row[vad_headers.index('S.O. transcript strand')] # Calculates the position of the variant amino acid(s) on peptide if transcript_strand == strand: aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1) if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq): aa_peppos = int(math.floor((pepposB - genpos)/3.0)) if pepseq in pep_muts: if aa_change not in pep_muts[pepseq]: pep_muts[pepseq][aa_change] = [aa_peppos] else: if aa_peppos not in pep_muts[pepseq][aa_change]: pep_muts[pepseq][aa_change].append(aa_peppos) else: pep_muts[pepseq] = {aa_change : [aa_peppos]} # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence. if input_line in pep_map: if pepseq not in pep_map[input_line]: pep_map[input_line].append(pepseq) else: pep_map[input_line] = [pepseq] with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout: tsvreader_1 = csv.reader(tsvin_1, delimiter='\t') tsvreader_2 = csv.reader(tsvin_2, delimiter='\t') tsvout = csv.writer(tsvout, delimiter='\t') headers = [] #loops through each row in the Variant Additional Details (VAD) file for row in tsvreader_2: #sets row_2 equal to the same row in Variant Result (VR) file row_2 = tsvreader_1.next() #checks if row is empty or if the first term contains '#' if row == [] or row[0][0] == '#': tsvout.writerow(row) else: if row[0] == 'Input line': #Goes through each value in the headers list in VAD for value in row: #Adds each value into headers headers.append(value) #Loops through the Keys in VR for i,value in enumerate(row_2): #Checks if the value is already in headers if value in headers: duplicate_indices.append(i) continue #else adds the header to headers else: headers.append(value) #Adds appropriate headers when proteomic input is supplied if (probed_filename != 'None'): headers.insert(n, 'Variant peptide') headers.insert(n, 'Reference peptide') tsvout.writerow(headers) else: cells = [] #Goes through each value in the next list for value in row: #adds it to cells cells.append(value) #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file) for i,value in enumerate(row_2): #adds in the rest of the values to cells if i not in duplicate_indices: # Skips the initial 11 columns and the VEST p-value (already in VR file) cells.append(value) # Verifies the peptides intersected previously through sequences obtained from Ensembl's API if (probed_filename != 'None'): cells.insert(n,'') cells.insert(n,'') input_line = cells[headers.index('Input line')] if input_line in pep_map: pepseq = pep_map[input_line][0] aa_changes = pep_muts[pepseq] transcript_id = cells[headers.index('S.O. transcript')] ref_fullseq = getSequence(transcript_id) # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available if not ref_fullseq: transcripts = cells[headers.index('S.O. all transcripts')] for transcript in transcripts.split(','): if transcript: mat = SOtranscripts.search(transcript) ref_fullseq = getSequence(mat.group(1)) if ref_fullseq: aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} break # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence if not ref_fullseq: transcripts = cells[headers.index('S.O. all transcripts')] for transcript in transcripts.split(','): if transcript: mat = SOtranscripts.search(transcript) ref_fullseq = getSequence(mat.group(1).split('.')[0]) if ref_fullseq: aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} break if ref_fullseq: # Sorts the amino acid changes positions = {} for aa_change in aa_changes: m = reg_seq_change.search(aa_change) aa_protpos = int(m.group(2)) aa_peppos = aa_changes[aa_change][0] aa_startpos = aa_protpos - aa_peppos - 1 if aa_startpos in positions: positions[aa_startpos].append(aa_change) else: positions[aa_startpos] = [aa_change] # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference) for pep_protpos in positions: ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)] muts = positions[pep_protpos] options = [] mut_seq = ref_seq for mut in muts: m = reg_seq_change.search(mut) ref_aa = m.group(1) mut_pos = int(m.group(2)) alt_aa = m.group(3) pep_mutpos = mut_pos - pep_protpos - 1 if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa): if pepseq[pep_mutpos] == ref_aa: mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:] else: mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:] else: break # Adds the mutated peptide and reference peptide if mutated correctly if pepseq == mut_seq: cells[n+1] = pepseq cells[n] = ref_seq #print cells tsvout.writerow(cells) #a = 'col1\tcol2\tcol3' #header_list = a.split('\t') #loop through the two results, when you first hit header you print out the headers in tabular form #Print out each header only once #Combine both headers into one output file #loop through the rest of the data and assign each value to its assigned header #combine this all into one output file