Mercurial > repos > rsajulga > proteomic_cravat_score_and_annotate

--- a/cravat_submit.py	Tue Apr 10 15:54:25 2018 -0400
+++ b/cravat_submit.py	Thu May 17 22:32:25 2018 -0400
@@ -5,30 +5,44 @@
 import sys
 import csv
 import re
+import math
+from difflib import SequenceMatcher
+from xml.etree import ElementTree as ET
+import sqlite3

 try:
     input_filename = sys.argv[1]
     input_select_bar = sys.argv[2]
     GRCh_build = sys.argv[3]
-    psm_filename = sys.argv[4]
+    probed_filename = sys.argv[4]
     output_filename = sys.argv[5]
     file_3 = sys.argv[6]
     file_4 = sys.argv[7]
     file_5 = sys.argv[8]
 except:
-    input_filename = '1.) Galaxy2-[Human_Vcf_MCF7]-minimum.vcf'
-    input_filename = 'input/[tgriffin_cguerrer_20160726_MCF7_RNAseq_01_S13_R1_001.vcf].vcf'
+    # Filenames for testing.
+    input_filename = 'input/Galaxy68-[VCF-BEDintersect__on_data_65_and_data_6].vcf'
     input_select_bar = 'VEST'
     GRCh_build = 'GRCh38'
     output_filename = 'combined_variants.tsv'
-    psm_filename = 'input/[ERLIC_MCF7_110kb_R123-CustomProDB_RNA-Seq_cRAP_DB.psm-report].tabular'
-    file_3 = 'output/' + time.strftime("%H:%M") + '_Z_Gene_Level_Analysis.tsv'
-    file_4 = 'output/' + time.strftime("%H:%M") + '_Z_Variant_Non-coding.Result.tsv'
-    file_5 = 'output/' + time.strftime("%H:%M") + '_Z_Input_Errors.Result.tsv'
+    probed_filename = 'input/Galaxy66-[PepPointer].bed'
+    file_3 = 'output/Gene_Level_Analysis.tsv'
+    file_4 = 'output/Variant_Non-coding.Result.tsv'
+    file_5 = 'output/Input_Errors.Result.tsv'
+    matches_filename = 'matches.tsv'

+def getSequence(transcript_id):
+    server = 'http://rest.ensembl.org'
+    ext = '/sequence/id/' + transcript_id + '?content-type=text/x-seqxml%2Bxml;multiple_sequences=1;type=protein'
+    req = requests.get(server+ext, headers={ "Content-Type" : "text/plain"})
+
+    if not req.ok:
+        return None
+
+    root = ET.fromstring(req.content)
+    for child in root.iter('AAseq'):
+        return child.text

-#in_file = open('input_call.txt', "r")
-#out_file = open('output_call.txt', "w")

 write_header = True

@@ -36,21 +50,26 @@
 if GRCh_build == 'GRCh37':
     GRCh37hg19 = 'on'

-# http://staging.cravat.us/CRAVAT/rest/service/submit:
-
 #plugs in params to given URL
 submit = requests.post('http://staging.cravat.us/CRAVAT/rest/service/submit', files={'inputfile':open(input_filename)}, data={'email':'znylund@insilico.us.com', 'analyses': input_select_bar, 'hg19': GRCh37hg19})

 #Makes the data a json dictionary, takes out only the job ID
 jobid = json.loads(submit.text)['jobid']
+
 #out_file.write(jobid)
 submitted = json.loads(submit.text)['status']
 #out_file.write('\t' + submitted)

 input_file = open(input_filename)
-is_comment_line = re.compile(".*#+.*")

-
+# Loads the proBED file as a list.
+if (probed_filename != 'None'):
+    proBED = []
+    with open(probed_filename) as tsvin:
+        tsvreader = csv.reader(tsvin, delimiter='\t')
+        for i, row in enumerate(tsvreader):
+            proBED.append(row)
+
 #loops until we find a status equal to Success, then breaks
 while True:
     check = requests.get('http://staging.cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid})
@@ -62,134 +81,194 @@
         break
     else:
         time.sleep(2)
-
+
 #out_file.write('\n')

 #creates three files
-file_1 = time.strftime("%H:%M") + '_Z_Variant_Result.tsv'
-file_2 = time.strftime("%H:%M") + '_Z_Additional_Details.tsv'
+file_1 = 'Variant_Result.tsv'
+file_2 = 'Additional_Details.tsv'
 #file_3 = time.strftime("%H:%M") + 'Combined_Variant_Results.tsv'

-#Download the two results
+#Downloads the tabular results
 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", file_1)
 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", file_2)
 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Gene_Level_Analysis.Result.tsv", file_3)
 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Non-coding.Result.tsv", file_4)
 urllib.urlretrieve("http://staging.cravat.us/CRAVAT/results/" + jobid + "/" + "Input_Errors.Result.tsv", file_5)

-headers = []
-duplicates = []
+#opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer
+with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
+    tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
+    tsvout = csv.writer(tsvout, delimiter='\t')
+
+    headers = []
+    duplicate_indices = []
+    n = 12 #Index for proteogenomic column start
+    reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)')
+    SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
+    pep_muts = {}
+    pep_map = {}
+    rows = []

-#opens the Variant Result file and the Variant Additional Details file as csv readers, then opens the output file (galaxy) as a writer
+    for row in tsvreader_2:
+        if row != [] and row[0][0] != '#':
+        #checks if the row begins with input line
+            if row[0] == 'Input line':
+                vad_headers = row
+            else:
+                # Initially screens through the output Variant Additional Details to catch mutations on same peptide region
+                genchrom = row[vad_headers.index('Chromosome')]
+                genpos = int(row[vad_headers.index('Position')])
+                aa_change = row[vad_headers.index('Protein sequence change')]
+                input_line = row[vad_headers.index('Input line')]
+
+                for peptide in proBED:
+                    pepseq = peptide[3]
+                    pepchrom = peptide[0]
+                    pepposA = int(peptide[1])
+                    pepposB = int(peptide[2])
+                    if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB:
+                        strand = row[vad_headers.index('Strand')]
+                        transcript_strand = row[vad_headers.index('S.O. transcript strand')]
+
+                        # Calculates the position of the variant amino acid(s) on peptide
+                        if transcript_strand == strand:
+                            aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
+                        if strand == '-' or transcript_strand == '-' or aa_peppos >= len(pepseq):
+                            aa_peppos = int(math.floor((pepposB - genpos)/3.0))
+                        if pepseq in pep_muts:
+                            if aa_change not in pep_muts[pepseq]:
+                                pep_muts[pepseq][aa_change] = [aa_peppos]
+                            else:
+                                if aa_peppos not in pep_muts[pepseq][aa_change]:
+                                    pep_muts[pepseq][aa_change].append(aa_peppos)
+                        else:
+                            pep_muts[pepseq] = {aa_change : [aa_peppos]}
+                        # Stores the intersect information by mapping Input Line (CRAVAT output) to peptide sequence.
+                        if input_line in pep_map:
+                            if pepseq not in pep_map[input_line]:
+                                pep_map[input_line].append(pepseq)
+                        else:
+                            pep_map[input_line] = [pepseq]
+
 with open(file_1) as tsvin_1, open(file_2) as tsvin_2, open(output_filename, 'wb') as tsvout:
     tsvreader_1 = csv.reader(tsvin_1, delimiter='\t')
     tsvreader_2 = csv.reader(tsvin_2, delimiter='\t')
-
     tsvout = csv.writer(tsvout, delimiter='\t')

-    # Processes the PSM report
-    if (psm_filename != 'None'):
-        tsvin_3 = open(psm_filename)
-        psmreader = csv.reader(tsvin_3, delimiter='\t')
-
-        psmreader.next()
-        peptide_map = {}
-        s = re.compile('[A-Z][0-9]+[A-Z]')
-        for row in psmreader:
-            pro_name = row[1]
-            pep_seq = row[2]
+    headers = []

-            prot_seq_changes = s.findall(pro_name)
+    #loops through each row in the Variant Additional Details (VAD) file
+    for row in tsvreader_2:

-            for change in prot_seq_changes:
-                if change in peptide_map:
-                    if pep_seq not in peptide_map[change].split(';'):
-                        peptide_map[change] = peptide_map[change] + ';' + pep_seq
-                else:
-                    peptide_map[change] = pep_seq
-
-    #loops through each row in the Variant Additional Details file
-
-    print 'Checkpoint 3'
-    for row in tsvreader_2:
-        #sets row_2 equal to the same row in Variant Result file
+        #sets row_2 equal to the same row in Variant Result (VR) file
         row_2 = tsvreader_1.next()
         #checks if row is empty or if the first term contains '#'
         if row == [] or row[0][0] == '#':
             tsvout.writerow(row)
         else:
-        #checks if the row begins with input line
-            if row[0] == 'Input line':
+            if row[0] == 'Input line':
                 #Goes through each value in the headers list in VAD
-                #print 'Original row'
-                #print row
-                #print row_2
                 for value in row:
                     #Adds each value into headers
                     headers.append(value)
                 #Loops through the Keys in VR
-                for value in row_2:
+                for i,value in enumerate(row_2):
                     #Checks if the value is already in headers
                     if value in headers:
+                        duplicate_indices.append(i)
                         continue
                     #else adds the header to headers
                     else:
                         headers.append(value)
-                if (psm_filename != 'None'):
-                    headers.insert(1, 'Peptide')
-                #print headers
+                #Adds appropriate headers when proteomic input is supplied
+                if (probed_filename != 'None'):
+                    headers.insert(n, 'Variant peptide')
+                    headers.insert(n, 'Reference peptide')
                 tsvout.writerow(headers)
-            else:
-
+            else:
                 cells = []
-                #Inserts a peptide column into the row
-                if (psm_filename != 'None'):
-                    if row[12] in peptide_map:
-                        row.insert(1, peptide_map[row[12]])
-                    else:
-                        row.insert(1, '')
-
                 #Goes through each value in the next list
-                for i,value in enumerate(row):
+                for value in row:
                     #adds it to cells
                     cells.append(value)
                 #Goes through each value from the VR file after position 11 (After it is done repeating from VAD file)
-                for i,value in enumerate(row_2[11:]):
+                for i,value in enumerate(row_2):
                     #adds in the rest of the values to cells
+                    if i not in duplicate_indices:
+                        # Skips the initial 11 columns and the VEST p-value (already in VR file)
+                        cells.append(value)

-                    # Skips the 2nd VEST p-value
-                    if (i != 49 - 11):
-                        cells.append(value)
-
-                print  cells
+                # Verifies the peptides intersected previously through sequences obtained from Ensembl's API
+                if (probed_filename != 'None'):
+                    cells.insert(n,'')
+                    cells.insert(n,'')
+                    input_line = cells[headers.index('Input line')]
+                    if input_line in pep_map:
+                        pepseq = pep_map[input_line][0]
+                        aa_changes = pep_muts[pepseq]
+                        transcript_id = cells[headers.index('S.O. transcript')]
+                        ref_fullseq = getSequence(transcript_id)
+                        # Checks the other S.O. transcripts if the primary S.O. transcript has no sequence available
+                        if not ref_fullseq:
+                            transcripts = cells[headers.index('S.O. all transcripts')]
+                            for transcript in transcripts.split(','):
+                                if transcript:
+                                    mat = SOtranscripts.search(transcript)
+                                    ref_fullseq = getSequence(mat.group(1))
+                                    if ref_fullseq:
+                                        aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
+                                        break
+                        # Resubmits the previous transcripts without extensions if all S.O. transcripts fail to provide a sequence
+                        if not ref_fullseq:
+                            transcripts = cells[headers.index('S.O. all transcripts')]
+                            for transcript in transcripts.split(','):
+                                if transcript:
+                                    mat = SOtranscripts.search(transcript)
+                                    ref_fullseq = getSequence(mat.group(1).split('.')[0])
+                                    if ref_fullseq:
+                                        aa_changes = {mat.group(2): [aa_changes.values()[0][0]]}
+                                        break
+                        if ref_fullseq:
+                            # Sorts the amino acid changes
+                            positions = {}
+                            for aa_change in aa_changes:
+                                m = reg_seq_change.search(aa_change)
+                                aa_protpos = int(m.group(2))
+                                aa_peppos = aa_changes[aa_change][0]
+                                aa_startpos = aa_protpos - aa_peppos - 1
+                                if aa_startpos in positions:
+                                    positions[aa_startpos].append(aa_change)
+                                else:
+                                    positions[aa_startpos] = [aa_change]
+                            # Goes through the sorted categories to mutate the Ensembl peptide (uses proBED peptide as a reference)
+                            for pep_protpos in positions:
+                                ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
+                                muts = positions[pep_protpos]
+                                options = []
+                                mut_seq = ref_seq
+                                for mut in muts:
+                                    m = reg_seq_change.search(mut)
+                                    ref_aa = m.group(1)
+                                    mut_pos = int(m.group(2))
+                                    alt_aa = m.group(3)
+                                    pep_mutpos = mut_pos - pep_protpos - 1
+                                    if ref_seq[pep_mutpos] == ref_aa and (pepseq[pep_mutpos] == alt_aa or pepseq[pep_mutpos] == ref_aa):
+                                        if pepseq[pep_mutpos] == ref_aa:
+                                            mut_seq = mut_seq[:pep_mutpos] + ref_aa + mut_seq[pep_mutpos+1:]
+                                        else:
+                                            mut_seq = mut_seq[:pep_mutpos] + alt_aa + mut_seq[pep_mutpos+1:]
+                                    else:
+                                        break
+                                # Adds the mutated peptide and reference peptide if mutated correctly
+                                if pepseq == mut_seq:
+                                    cells[n+1] = pepseq
+                                    cells[n] = ref_seq
+                #print  cells
                 tsvout.writerow(cells)


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-