# HG changeset patch
# User galaxyp
# Date 1534436855 14400
# Node ID 2c7bcc1219fc4f01cb554335305a16cf064f5808
# Parent 83181dabeb905166e792003e78c3a7ac37f19b19
Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
diff -r 83181dabeb90 -r 2c7bcc1219fc ._cravatp_submit.py
Binary file ._cravatp_submit.py has changed
diff -r 83181dabeb90 -r 2c7bcc1219fc ._cravatp_submit.xml
Binary file ._cravatp_submit.xml has changed
diff -r 83181dabeb90 -r 2c7bcc1219fc cravat_submit.py
--- a/cravat_submit.py Fri May 18 13:25:29 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,287 +0,0 @@
-import requests
-import json
-import time
-import urllib
-import sys
-import csv
-import re
-import math
-from difflib import SequenceMatcher
-from xml.etree import ElementTree as ET
-import sqlite3
-
-try:
- input_filename = sys.argv[1]
- input_select_bar = sys.argv[2]
- GRCh_build = sys.argv[3]
- probed_filename = sys.argv[4]
- output_filename = sys.argv[5]
- file_3 = sys.argv[6]
- file_4 = sys.argv[7]
- file_5 = sys.argv[8]
-except:
- # Filenames for testing.
- input_filename = 'test-data/[VCF-BEDintersect__on_data_65_and_data_6].vcf'
- probed_filename = 'test-data/[PepPointer].bed'
- input_select_bar = 'VEST'
- GRCh_build = 'GRCh38'
- output_filename = 'combined_variants.tsv'
- file_3 = 'test-results/Gene_Level_Analysis.tsv'
- file_4 = 'test-results/Variant_Non-coding.Result.tsv'
- file_5 = 'test-results/Input_Errors.Result.tsv'
- matches_filename = 'matches.tsv'
-
-def getSequence(transcript_id):
- server = 'http://rest.ensembl.org'
- ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein'
- req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"})
-
- if not req.ok:
- return None
-
- root = ET.fromstring(req.content)
- for child in root.iter('AAseq'):
- return child.text
-
-
-write_header = True
-
-GRCh37hg19 = 'off'
-if GRCh_build == 'GRCh37':
- GRCh37hg19 = 'on'
-
-#plugs in params to given URL
-submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19})
-
-#Makes the data a json dictionary, takes out only the job ID
-jobid = json.loads(submit.text)['jobid']
-
-#out_file.write(jobid)
-submitted = json.loads(submit.text)['status']
-#out_file.write('\t' + submitted)
-
-input_file = open(input_filename)
-
-# Loads the proBED file as a list.
-if (probed_filename != 'None'):
- proBED = []
- with open(probed_filename) as tsvin:
- tsvreader = csv.reader(tsvin, delimiter='\t')
- for i, row in enumerate(tsvreader):
- proBED.append(row)
-
-#loops until we find a status equal to Success, then breaks
-while True:
- check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
- status = json.loads(check.text)['status']
- resultfileurl = json.loads(check.text)['resultfileurl']
- #out_file.write(str(status) + ', ')
- if status == 'Success':
- #out_file.write('\t' + resultfileurl)
- break
- else:
- time.sleep(2)
-
-#out_file.write('\n')
-
-#creates three files
-file_1 = 'Variant_Result.tsv'
-file_2 = 'Additional_Details.tsv'
-#file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv'
-
-#Downloads the tabular results
-urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1)
-urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2)
-urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3)
-urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4)
-urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5)
-
-#opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer
-with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
- tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
- tsvout = csv.writer(tsvout, delimiter='\t')
-
- headers = []
- duplicate_indices = []
- n = 12 #Index for proteogenomic column start
- reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)')
- SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
- pep_muts = {}
- pep_map = {}
- rows = []
-
- for row in tsvreader_2:
- if row != [] and row[0][0] != '#':
- #checks if the row begins with input line
- if row[0] == 'Input line':
- vad_headers = row
- else:
- # Initially screens through the output Variant Additional Details to catch mutations on same peptide region
- genchrom = row[vad_headers.index('Chromosome')]
- genpos = int(row[vad_headers.index('Position')])
- aa_change = row[vad_headers.index('Protein sequence change')]
- input_line = row[vad_headers.index('Input line')]
-
- for peptide in proBED:
- pepseq = peptide[3]
- pepchrom = peptide[0]
- pepposA = int(peptide[1])
- pepposB = int(peptide[2])
- if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB:
- strand = row[vad_headers.index('Strand')]
- transcript_strand = row[vad_headers.index('S.O. transcript strand')]
-
- # Calculates the position of the variant amino acid(s) on peptide
- if transcript_strand == strand:
- aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
- if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq):
- aa_peppos = int(math.floor((pepposB - genpos)/3.0))
- if pepseq in pep_muts:
- if aa_change not in pep_muts[pepseq]:
- pep_muts[pepseq][aa_change] = [aa_peppos]
- else:
- if aa_peppos not in pep_muts[pepseq][aa_change]:
- pep_muts[pepseq][aa_change].append(aa_peppos)
- else:
- pep_muts[pepseq] = {aa_change : [aa_peppos]}
- # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence.
- if input_line in pep_map:
- if pepseq not in pep_map[input_line]:
- pep_map[input_line].append(pepseq)
- else:
- pep_map[input_line] = [pepseq]
-
-with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
- tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
- tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
- tsvout = csv.writer(tsvout, delimiter='\t')
-
- headers = []
-
- #loops through each row in the Variant Additional Details (VAD) file
- for row in tsvreader_2:
-
- #sets row_2 equal to the same row in Variant Result (VR) file
- row_2 = tsvreader_1.next()
- #checks if row is empty or if the first term contains '#'
- if row == [] or row[0][0] == '#':
- tsvout.writerow(row)
- else:
- if row[0] == 'Input line':
- #Goes through each value in the headers list in VAD
- for value in row:
- #Adds each value into headers
- headers.append(value)
- #Loops through the Keys in VR
- for i,value in enumerate(row_2):
- #Checks if the value is already in headers
- if value in headers:
- duplicate_indices.append(i)
- continue
- #else adds the header to headers
- else:
- headers.append(value)
- #Adds appropriate headers when proteomic input is supplied
- if (probed_filename != 'None'):
- headers.insert(n, 'Variant peptide')
- headers.insert(n, 'Reference peptide')
- tsvout.writerow(headers)
- else:
- cells = []
- #Goes through each value in the next list
- for value in row:
- #adds it to cells
- cells.append(value)
- #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file)
- for i,value in enumerate(row_2):
- #adds in the rest of the values to cells
- if i not in duplicate_indices:
- # Skips the initial 11 columns and the VEST p-value (already in VR file)
- cells.append(value)
-
- # Verifies the peptides intersected previously through sequences obtained from Ensembl's API
- if (probed_filename != 'None'):
- cells.insert(n,'')
- cells.insert(n,'')
- input_line = cells[headers.index('Input line')]
- if input_line in pep_map:
- pepseq = pep_map[input_line][0]
- aa_changes = pep_muts[pepseq]
- transcript_id = cells[headers.index('S.O. transcript')]
- ref_fullseq = getSequence(transcript_id)
- # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available
- if not ref_fullseq:
- transcripts = cells[headers.index('S.O. all transcripts')]
- for transcript in transcripts.split(','):
- if transcript:
- mat = SOtranscripts.search(transcript)
- ref_fullseq = getSequence(mat.group(1))
- if ref_fullseq:
- aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
- break
- # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence
- if not ref_fullseq:
- transcripts = cells[headers.index('S.O. all transcripts')]
- for transcript in transcripts.split(','):
- if transcript:
- mat = SOtranscripts.search(transcript)
- ref_fullseq = getSequence(mat.group(1).split('.')[0])
- if ref_fullseq:
- aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
- break
- if ref_fullseq:
- # Sorts the amino acid changes
- positions = {}
- for aa_change in aa_changes:
- m = reg_seq_change.search(aa_change)
- aa_protpos = int(m.group(2))
- aa_peppos = aa_changes[aa_change][0]
- aa_startpos = aa_protpos - aa_peppos - 1
- if aa_startpos in positions:
- positions[aa_startpos].append(aa_change)
- else:
- positions[aa_startpos] = [aa_change]
- # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference)
- for pep_protpos in positions:
- ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
- muts = positions[pep_protpos]
- options = []
- mut_seq = ref_seq
- for mut in muts:
- m = reg_seq_change.search(mut)
- ref_aa = m.group(1)
- mut_pos = int(m.group(2))
- alt_aa = m.group(3)
- pep_mutpos = mut_pos - pep_protpos - 1
- if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa):
- if pepseq[pep_mutpos] == ref_aa:
- mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:]
- else:
- mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:]
- else:
- break
- # Adds the mutated peptide and reference peptide if mutated correctly
- if pepseq == mut_seq:
- cells[n+1] = pepseq
- cells[n] = ref_seq
- #print cells
- tsvout.writerow(cells)
-
-
-
-
-
-
-#a = 'col1\tcol2\tcol3'
-#header_list = a.split('\t')
-
-#loop through the two results, when you first hit header you print out the headers in tabular form
-#Print out each header only once
-#Combine both headers into one output file
-#loop through the rest of the data and assign each value to its assigned header
-#combine this all into one output file
-
-
-
-
-
diff -r 83181dabeb90 -r 2c7bcc1219fc cravat_submit.xml
--- a/cravat_submit.xml Fri May 18 13:25:29 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-
- Submits, checks for, and retrieves data for cancer annotation
- cravat_submit.py $input $dropdown $GRCh $psm $Variant $Gene $Noncoding $Error
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- This tool submits, checks for, and retrieves data for cancer annotation from the CRAVAT platform at cravat.us.
-
-
-
diff -r 83181dabeb90 -r 2c7bcc1219fc cravatp_submit.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravatp_submit.py Thu Aug 16 12:27:35 2018 -0400
@@ -0,0 +1,390 @@
+# -*- coding: utf-8 -*-
+#
+# Author: Ray W. Sajulga
+#
+#
+
+import requests # pipenv requests
+import json
+import time
+import urllib
+import sys
+import csv
+import re
+import math
+import argparse
+from xml.etree import ElementTree as ET
+from zipfile import ZipFile
+try: #Python 3
+ from urllib.request import urlopen
+except ImportError: #Python 2
+ from urllib2 import urlopen
+from io import BytesIO
+
+# initializes blank parameters
+chasm_classifier = ''
+probed_filename = None
+intersected_only = False
+vcf_output = None
+analysis_type = None
+
+# # Testing Command
+# python cravatp_submit.py test-data/Freebayes_two-variants.vcf GRCh38
+# test-data/variant.tsv test-data/gene.tsv test-data/noncoding.tsv
+# test-data/error.tsv CHASM -—classifier Breast -—proBED
+# test-data/MCF7_proBed.bed
+parser = argparse.ArgumentParser()
+parser.add_argument('cravatInput',help='The filename of the input '
+ 'CRAVAT-formatted tabular file '
+ '(e.g., VCF)')
+parser.add_argument('GRCh', help='The name of the human reference '
+ 'genome used for annotation: '
+ 'GRCh38/hg38 or GRCh37/hg19')
+parser.add_argument('variant', help='The filename of the output '
+ 'variant file')
+parser.add_argument('gene', help='The filename of the output gene '
+ 'variant report')
+parser.add_argument('noncoding', help='The filename of the output '
+ 'non-coding variant report')
+parser.add_argument('error', help='The filename of the output error '
+ 'file')
+parser.add_argument('analysis', help='The machine-learning algorithm '
+ 'used for CRAVAT annotation (VEST'
+ ' and/or CHASM)')
+parser.add_argument('--classifier', help='The cancer classifier for the'
+ ' CHASM algorithm')
+parser.add_argument('--proBED', help='The filename of the proBED file '
+ 'containing peptides with genomic '
+ 'coordinates')
+parser.add_argument('--intersectOnly', help='Specifies whether to '
+ 'analyze only variants '
+ 'intersected between the '
+ 'CRAVAT input and proBED '
+ 'file')
+parser.add_argument('--vcfOutput', help='The output filename of the '
+ 'intersected VCF file')
+
+# assigns parsed arguments to appropriate variables
+args = parser.parse_args()
+input_filename = args.cravatInput
+GRCh_build = args.GRCh
+output_filename = args.variant
+file_3 = args.gene
+file_4 = args.noncoding
+file_5 = args.error
+if args.analysis != 'None':
+ analysis_type = args.analysis
+if args.classifier:
+ chasm_classifier = args.classifier
+if args.proBED:
+ probed_filename = args.proBED
+if args.intersectOnly:
+ intersected_only = args.intersectOnly
+if args.vcfOutput:
+ vcf_output = args.vcfOutput
+
+if analysis_type and '+' in analysis_type:
+ analysis_type = 'CHASM;VEST'
+
+# obtains the transcript's protein sequence using Ensembl API
+def getSequence(transcript_id):
+ server = 'http://rest.ensembl.org'
+ ext = ('/sequence/id/' + transcript_id
+ + '?content-type=text/x-seqxml%2Bxml;'
+ 'multiple_sequences=1;type=protein')
+ req = requests.get(server+ext,
+ headers={ "Content-Type" : "text/plain"})
+
+ if not req.ok:
+ return None
+
+ root = ET.fromstring(req.content)
+ for child in root.iter('AAseq'):
+ return child.text
+
+# parses the proBED file as a list.
+def loadProBED():
+ proBED = []
+ with open(probed_filename) as tsvin:
+ tsvreader = csv.reader(tsvin, delimiter='\t')
+ for i, row in enumerate(tsvreader):
+ proBED.append(row)
+ return proBED
+
+write_header = True
+
+
+# Creates an VCF file that only contains variants that overlap with the
+# proteogenomic input (proBED) file if the user specifies that they want
+# to only include intersected variants or if they want to receive the
+# intersected VCF as well.
+if probed_filename and (vcf_output or intersected_only == 'true'):
+ proBED = loadProBED()
+ if not vcf_output:
+ vcf_output = 'intersected_input.vcf'
+ with open(input_filename) as tsvin, open(vcf_output, 'wb') as tsvout:
+ tsvreader = csv.reader(tsvin, delimiter='\t')
+ tsvout = csv.writer(tsvout, delimiter='\t', escapechar=' ',
+ quoting=csv.QUOTE_NONE)
+
+ for row in tsvreader:
+ if row == [] or row[0][0] == '#':
+ tsvout.writerow(row)
+ else:
+ genchrom = row[0]
+ genpos = int(row[1])
+
+ for peptide in proBED:
+ pepchrom = peptide[0]
+ pepposA = int(peptide[1])
+ pepposB = int(peptide[2])
+ if (genchrom == pepchrom and
+ pepposA <= genpos and
+ genpos <= pepposB):
+ tsvout.writerow(row)
+ break
+if intersected_only == 'true':
+ input_filename = vcf_output
+
+# sets up the parameters for submission to the CRAVAT API
+parameters = {'email':'rsajulga@umn.edu',
+ 'hg19': 'on' if GRCh_build == 'GRCh37' else 'off',
+ 'functionalannotation': 'on',
+ 'tsvreport' : 'on',
+ 'mupitinput' : 'on'}
+if analysis_type:
+ parameters['analyses'] = analysis_type
+if chasm_classifier:
+ parameters['chasmclassifier'] = chasm_classifier
+
+# plugs in params to given URL
+submit = requests.post('http://www.cravat.us/CRAVAT/rest/service/submit',
+ files = {'inputfile':open(input_filename)},
+ data = parameters)
+
+# makes the data a json dictionary; takes out only the job ID
+jobid = json.loads(submit.text)['jobid']
+
+# loops until we find a status equal to Success, then breaks
+while True:
+ check = requests.get(
+ 'http://www.cravat.us/CRAVAT/rest/service/status',
+ params = {'jobid' : jobid})
+ status = json.loads(check.text)['status']
+ resultfileurl = json.loads(check.text)['resultfileurl']
+ #out_file.write(str(status) + ', ')
+ if status == 'Success':
+ #out_file.write('\t' + resultfileurl)
+ break
+ else:
+ time.sleep(2)
+
+# obtains the zipfile created by CRAVAT and loads the variants and VAD
+# file for processing
+r = requests.get(resultfileurl, stream=True)
+url = urlopen(resultfileurl)
+zipfile = ZipFile(BytesIO(r.content))
+variants = zipfile.open(jobid + '/Variant.Result.tsv').readlines()
+vad = zipfile.open(jobid + '/Variant_Additional_Details.Result.tsv').readlines()
+
+# reads and writes the gene, noncoding, and error files
+open(file_3, 'wb').write(zipfile.read(jobid + '/Gene_Level_Analysis.Result.tsv'))
+open(file_4, 'wb').write(zipfile.read(jobid + '/Variant_Non-coding.Result.tsv'))
+open(file_5, 'wb').write(zipfile.read(jobid + '/Input_Errors.Result.tsv'))
+
+
+
+if probed_filename and not vcf_output:
+ proBED = loadProBED()
+
+if probed_filename:
+ with open(output_filename, 'w') as tsvout:
+ tsvout = csv.writer(tsvout,
+ delimiter='\t',
+ escapechar=' ',
+ quoting=csv.QUOTE_NONE)
+ n = 11 #Index for proteogenomic column start
+ reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)')
+ SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
+ pep_muts = {}
+ pep_map = {}
+ rows = []
+ for row in vad:
+ row = row.decode().split('\t')
+ row[-1] = row[-1].replace('\n','')
+ if row and row[0] and not row[0].startswith('#'):
+ # checks if the row begins with input line
+ if row[0].startswith('Input line'):
+ vad_headers = row
+
+ else:
+ # Initially screens through the output Variant
+ # Additional Details to catch mutations on
+ # same peptide region
+ genchrom = row[vad_headers.index('Chromosome')]
+ genpos = int(row[vad_headers.index('Position')])
+ aa_change = row[vad_headers.index('Protein sequence change')]
+ input_line = row[vad_headers.index('Input line')]
+
+ for peptide in proBED:
+ pepseq = peptide[3]
+ pepchrom = peptide[0]
+ pepposA = int(peptide[1])
+ pepposB = int(peptide[2])
+ if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB:
+ strand = row[vad_headers.index('Strand')]
+ transcript_strand = row[vad_headers.index('S.O. transcript strand')]
+
+ # Calculates the position of the variant
+ # amino acid(s) on peptide
+ if transcript_strand == strand:
+ aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
+ if (strand == '-' or transcript_strand == '-'
+ or aa_peppos >= len(pepseq)):
+ aa_peppos = int(math.floor((pepposB - genpos)/3.0))
+ if pepseq in pep_muts:
+ if aa_change not in pep_muts[pepseq]:
+ pep_muts[pepseq][aa_change] = [aa_peppos]
+ else:
+ if aa_peppos not in pep_muts[pepseq][aa_change]:
+ pep_muts[pepseq][aa_change].append(aa_peppos)
+ else:
+ pep_muts[pepseq] = {aa_change : [aa_peppos]}
+ # Stores the intersect information by mapping
+ # Input Line (CRAVAT output) to peptide sequence.
+ if input_line in pep_map:
+ if pepseq not in pep_map[input_line]:
+ pep_map[input_line].append(pepseq)
+ else:
+ pep_map[input_line] = [pepseq]
+ # TODO: Need to obtain strand information as
+ # well i.e., positive (+) or negative (-)
+
+
+with open(output_filename, 'w') as tsvout:
+ tsvout = csv.writer(tsvout, delimiter='\t', escapechar='',
+ quoting=csv.QUOTE_NONE)
+ headers = []
+ duplicate_indices = []
+
+ # loops through each row in the Variant Additional Details (VAD) file
+ for x, row in enumerate(variants):
+ row = row.decode().split('\t')
+ row[-1] = row[-1].replace('\n','')
+ # sets row_2 equal to the same row in Variant Result (VR) file
+ row_2 = vad[x].decode().split('\t')
+ row_2[-1] = row_2[-1].replace('\n','')
+
+ # checks if row is empty or if the first term contains '#'
+ if not row or not row[0] or row[0].startswith('#'):
+ if row[0]:
+ tsvout.writerow(row)
+ else:
+ if row[0].startswith('Input line'):
+ # goes through each value in the headers list in VAD
+ headers = row
+ # loops through the Keys in VR
+ for i,value in enumerate(row_2):
+ #Checks if the value is already in headers
+ if value in headers:
+ duplicate_indices.append(i)
+ continue
+ #else adds the header to headers
+ else:
+ headers.append(value)
+ # adds appropriate headers when proteomic input is supplied
+ if probed_filename:
+ headers.insert(n, 'Variant peptide')
+ headers.insert(n, 'Reference peptide')
+ tsvout.writerow(headers)
+ else:
+ cells = []
+ # goes through each value in the next list
+ for value in row:
+ #adds it to cells
+ cells.append(value)
+ # goes through each value from the VR file after position
+ # 11 (After it is done repeating from VAD file)
+ for i,value in enumerate(row_2):
+ # adds in the rest of the values to cells
+ if i not in duplicate_indices:
+ # Skips the initial 11 columns and the
+ # VEST p-value (already in VR file)
+ cells.append(value)
+
+ # Verifies the peptides intersected previously through
+ # sequences obtained from Ensembl's API
+ if probed_filename:
+ cells.insert(n,'')
+ cells.insert(n,'')
+ input_line = cells[headers.index('Input line')]
+ if input_line in pep_map:
+ pepseq = pep_map[input_line][0]
+ aa_changes = pep_muts[pepseq]
+ transcript_id = cells[headers.index('S.O. transcript')]
+ ref_fullseq = getSequence(transcript_id)
+ # Checks the other S.O. transcripts if the primary
+ # S.O. transcript has no sequence available
+ if not ref_fullseq:
+ transcripts = cells[headers.index('S.O. all transcripts')]
+ for transcript in transcripts.split(','):
+ if transcript:
+ mat = SOtranscripts.search(transcript)
+ ref_fullseq = getSequence(mat.group(1))
+ if ref_fullseq:
+ aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
+ break
+ # Resubmits the previous transcripts without
+ # extensions if all S.O. transcripts fail to
+ # provide a sequence
+ if not ref_fullseq:
+ transcripts = cells[headers.index('S.O. all transcripts')]
+ for transcript in transcripts.split(','):
+ if transcript:
+ mat = SOtranscripts.search(transcript)
+ ref_fullseq = getSequence(mat.group(1).split('.')[0])
+ if ref_fullseq:
+ aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
+ break
+ if ref_fullseq:
+ # Sorts the amino acid changes
+ positions = {}
+ for aa_change in aa_changes:
+ m = reg_seq_change.search(aa_change)
+ aa_protpos = int(m.group(2))
+ aa_peppos = aa_changes[aa_change][0]
+ aa_startpos = aa_protpos - aa_peppos - 1
+ if aa_startpos in positions:
+ positions[aa_startpos].append(aa_change)
+ else:
+ positions[aa_startpos] = [aa_change]
+ # Goes through the sorted categories to mutate the Ensembl peptide
+ # (uses proBED peptide as a reference)
+ for pep_protpos in positions:
+ ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
+ muts = positions[pep_protpos]
+ options = []
+ mut_seq = ref_seq
+ for mut in muts:
+ m = reg_seq_change.search(mut)
+ ref_aa = m.group(1)
+ mut_pos = int(m.group(2))
+ alt_aa = m.group(3)
+ pep_mutpos = mut_pos - pep_protpos - 1
+ if (ref_seq[pep_mutpos] == ref_aa
+ and (pepseq[pep_mutpos] == alt_aa
+ or pepseq[pep_mutpos] == ref_aa)):
+ if pepseq[pep_mutpos] == ref_aa:
+ mut_seq = (mut_seq[:pep_mutpos] + ref_aa
+ + mut_seq[pep_mutpos+1:])
+ else:
+ mut_seq = (mut_seq[:pep_mutpos] + alt_aa
+ + mut_seq[pep_mutpos+1:])
+ else:
+ break
+ # Adds the mutated peptide and reference peptide if mutated correctly
+ if pepseq == mut_seq:
+ cells[n+1] = pepseq
+ cells[n] = ref_seq
+ tsvout.writerow(cells)
+
diff -r 83181dabeb90 -r 2c7bcc1219fc cravatp_submit.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cravatp_submit.xml Thu Aug 16 12:27:35 2018 -0400
@@ -0,0 +1,296 @@
+
+ | Submits, intersects, checks for, and retrieves data for cancer annotation.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ proteo['proteoInput'] == 'yes' and proteo['output_vcf']
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 10.1158/0008-5472.CAN-17-0338
+ 10.1186/s13059-017-1377-x
+
+
diff -r 83181dabeb90 -r 2c7bcc1219fc test-data/._variant.tsv
Binary file test-data/._variant.tsv has changed
diff -r 83181dabeb90 -r 2c7bcc1219fc test-data/Freebayes.vcf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Freebayes.vcf Thu Aug 16 12:27:35 2018 -0400
@@ -0,0 +1,548 @@
+##fileformat=VCFv4.2
+##fileDate=20180518
+##source=freeBayes v1.1.0-46-g8d2b3a0-dirty
+##reference=/panfs/roc/rissdb/galaxy/genomes/hg38/seq/hg38.fa
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=
+##contig=