Mercurial > repos > in_silico > cravat_score_and_annotate
changeset 18:25f77adaf9d9 draft
Uploaded
author | in_silico |
---|---|
date | Thu, 16 Aug 2018 15:09:58 -0400 |
parents | 9985359fa7ff |
children | 275d45d14350 |
files | cravat_convert/__pycache__/base_converter.cpython-36.pyc cravat_convert/__pycache__/vcf_converter.cpython-36.pyc cravat_convert/base_converter.py cravat_convert/cravat_convert.py cravat_convert/cravat_convert.xml cravat_convert/vcf_converter.py cravat_submit/Z_Additional_Details2018-08-16_15-00-12_.tsv cravat_submit/Z_Variant_Result2018-08-16_15-00-12_.tsv cravat_submit/cravat_submit.py cravat_submit/cravat_submit.xml |
diffstat | 10 files changed, 365 insertions(+), 241 deletions(-) [+] |
line wrap: on
line diff
# cravat_convert/base_converter.py (new file in this changeset)


class BaseConverter(object):
    """Interface that every input-format converter is expected to implement.

    Each of the three hook methods raises NotImplementedError until a
    subclass overrides it; the message names the converter's format.
    """

    def __init__(self):
        # Subclasses overwrite this with the name of the format they handle.
        self.format_name = None

    def check_format(self, *args, **kwargs):
        """Hook for format detection; must be overridden by subclasses."""
        raise NotImplementedError(
            'Converter for %s format has no method check_format'
            % self.format_name
        )

    def setup(self, *args, **kwargs):
        """Hook for pre-conversion setup; must be overridden by subclasses."""
        raise NotImplementedError(
            'Converter for %s format has no method setup'
            % self.format_name
        )

    def convert_line(self, *args, **kwargs):
        """Hook for per-line conversion; must be overridden by subclasses."""
        raise NotImplementedError(
            'Converter for %s format has no method convert_line'
            % self.format_name
        )


class BadFormatError(Exception):
    """Raised when input does not follow the expected file format.

    message: human-readable description, used as the exception text.
    errors: optional extra payload kept on the instance (for example
        custom error codes, should they be introduced later).
    """

    def __init__(self, message, errors=None):
        Exception.__init__(self, message)
        self.errors = errors
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cravat_convert/cravat_convert.py Thu Aug 16 15:09:58 2018 -0400 @@ -0,0 +1,80 @@ +from __future__ import print_function +import os +import argparse +from vcf_converter import CravatConverter + +def get_vcf_mapping(): + """ : VCF Headers mapped to their index position in a row of VCF values. + : These are only the mandatory columns, per the VCF spec. + """ + return { + 'CHROM': 0, + 'POS': 1, + 'ID': 2, + 'REF': 3, + 'ALT': 4, + 'QUAL': 5, + 'FILTER': 6, + 'INFO': 7 + } + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--input', + '-i', + required = True, + help='Input path to a VCF file for conversion',) + parser.add_argument('--output', + '-o', + default = None, + help = 'Output path to write the cravat file to') + return parser.parse_args() + + +def convert(in_path, out_path=None, cr_sep='\t', cr_newline='\n'): + """ : Convert a VCF file to a Cravat file. + : Arguments: + : in_path: <str> path to input vcf file + : out_path: <str> path to output cravat file. Will defualt to cravat_converted.txt in the input directory. + : cr_sep: <str> the value delimiter for the output cravat file. Default value of '\\t'. + : out_newline: <str> the newline delimiter in the output cravat file. Default of '\\n' + """ + if not out_path: + base, _ = os.path.split(in_path) + out_path = os.path.join(base, "cravat_converted.txt") + + with open(in_path, 'r') as in_file, \ + open(out_path, 'w') as out_file: + + # cr_count will be used to generate the 'TR' field of the cravat rows (first header) + cr_count = 0 + # VCF lines are always assumed to be '+' strand, as VCF doesn't specify that attribute + strand = '+' + # VCF converter. Adjusts position, reference, and alternate for Cravat formatting. 
+ converter = CravatConverter() + # A dictionary of mandatory vcf headers mapped to their row indices + vcf_mapping = get_vcf_mapping() + + for line in in_file: + if line.startswith("#"): + continue + line = line.strip().split() + # row is dict of VCF headers mapped to corresponding values of this line + row = { header: line[index] for header, index in vcf_mapping.items() } + for alt in row["ALT"].split(","): + new_pos, new_ref, new_alt = converter.extract_vcf_variant(strand, row["POS"], row["REF"], alt) + new_pos, new_ref, new_alt = str(new_pos), str(new_ref), str(new_alt) + cr_line = cr_sep.join([ + 'TR' + str(cr_count), row['CHROM'], new_pos, strand, new_ref, new_alt, row['ID'] + ]) + out_file.write(cr_line + cr_newline) + cr_count += 1 + + +if __name__ == "__main__": + cli_args = get_args() + if cli_args.output == None: + base, _ = os.path.split(cli_args.input) + cli_args.output = os.path.join(base, "cravat_converted.txt") + convert(in_path = cli_args.input, out_path = cli_args.output)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cravat_convert/cravat_convert.xml Thu Aug 16 15:09:58 2018 -0400 @@ -0,0 +1,20 @@ +<tool id="cravat_convert" name="CRAVAT Convert" version="1.0.0"> + <description>Converts a VCF format file to a Cravat format file</description> + <command interpreter="python">cravat_convert.py -i $input -o $output</command> + + <inputs> + <param format="tabular" name="input" type="data" label="Source file"/> + </inputs> + + <outputs> + <data format="tabular" name="output" /> + </outputs> + + <!-- <tests></tests> --> + + <help> + Converts a VCF format file to a Cravat format file + </help> + +</tool> +
# cravat_convert/vcf_converter.py (new file in this changeset)
"""
A module originally obtained from the cravat package. Modified to use in the vcf
converter galaxy tool.


Register of changes made (Chris Jacoby):
    1) Changed imports as galaxy tool won't have access to complete cravat python package
    2) Defined BadFormatError in BaseConverted file, as I didn't have the BadFormatError module
"""

from base_converter import BaseConverter, BadFormatError
import re


class CravatConverter(BaseConverter):
    """Converter that turns VCF data lines into CRAVAT variant dictionaries."""

    def __init__(self):
        super(CravatConverter, self).__init__()
        self.format_name = 'vcf'
        # Sample names read from the '#CHROM' header line by setup().
        self.samples = []
        # Number of data lines seen; used to name variants whose ID is '.'.
        self.var_counter = 0
        self.addl_cols = [{'name': 'phred',
                           'title': 'Phred',
                           'type': 'string'},
                          {'name': 'filter',
                           'title': 'VCF filter',
                           'type': 'string'},
                          {'name': 'zygosity',
                           'title': 'Zygosity',
                           'type': 'string'},
                          {'name': 'alt_reads',
                           'title': 'Alternate reads',
                           'type': 'int'},
                          {'name': 'tot_reads',
                           'title': 'Total reads',
                           'type': 'int'},
                          {'name': 'af',
                           'title': 'Variant allele frequency',
                           'type': 'float'}]

    def check_format(self, f):
        """Return True if the next line read from `f` is a VCF fileformat line."""
        return f.readline().startswith('##fileformat=VCF')

    def setup(self, f):
        """Read `f` up to the '#CHROM' header line and record the sample names.

        Sample names are the header columns after the ninth (FORMAT) column.
        Reading stops at the header line.
        """
        for line in f:
            if line.startswith('#CHROM'):
                toks = re.split(r'\s+', line.rstrip())
                if len(toks) > 8:
                    self.samples = toks[9:]
                break

    def convert_line(self, l):
        """Convert one VCF line into a list of variant dictionaries.

        Returns None for header lines (starting with '#'). Without sample
        columns one dictionary is produced per ALT allele; with sample
        columns one dictionary is produced per sample and per non-reference
        allele called in that sample's GT field.

        Raises:
            BadFormatError: fewer than 8 columns, blank ID, missing or
                malformed GT, an allele index with no matching ALT, or
                more sample columns than sample names known from setup().
        """
        if l.startswith('#'):
            return None
        self.var_counter += 1
        toks = l.strip('\r\n').split('\t')
        if len(toks) < 8:
            raise BadFormatError('Wrong VCF format')
        chrom, pos, tag, ref, alts, qual, vcf_filter, info = toks[:8]
        if tag == '':
            raise BadFormatError('ID column is blank')
        elif tag == '.':
            tag = 'VAR' + str(self.var_counter)
        if chrom[:3] != 'chr':
            chrom = 'chr' + chrom
        alts = alts.split(',')
        all_wdicts = []

        if len(toks) == 8:
            # No genotype columns: report every ALT allele once.
            for alt in alts:
                newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
                all_wdicts.append({'tags': tag,
                                   'chrom': chrom,
                                   'pos': newpos,
                                   'ref_base': newref,
                                   'alt_base': newalt,
                                   'sample_id': 'no_sample',
                                   'phred': qual,
                                   'filter': vcf_filter})
            return all_wdicts

        # FORMAT column: map each genotype key to its position in a sample entry.
        genotype_fields = {}
        for field_no, field_name in enumerate(toks[8].split(':')):
            genotype_fields[field_name] = field_no
        if 'GT' not in genotype_fields:
            raise BadFormatError('No GT Field')
        gt_field_no = genotype_fields['GT']

        sample_datas = toks[9:]
        if len(sample_datas) > len(self.samples):
            raise BadFormatError(
                'Line has %d sample columns but only %d sample names are known'
                % (len(sample_datas), len(self.samples)))

        for sample, sample_str in zip(self.samples, sample_datas):
            sample_data = sample_str.split(':')
            if gt_field_no >= len(sample_data):
                raise BadFormatError('No GT value for sample %s' % sample)
            gt_str = sample_data[gt_field_no]
            gts = self._called_alleles(gt_str)
            for gt in sorted(gts.keys()):
                if gt == 0:
                    # Reference allele: nothing to report.
                    continue
                if gt > len(alts):
                    raise BadFormatError(
                        'GT allele index %d has no matching ALT allele' % gt)
                alt = alts[gt - 1]
                newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
                zyg = self.homo_hetro(gt_str)
                depth, alt_reads, af = self.extract_read_info(
                    sample_data, gt, gts, genotype_fields)
                all_wdicts.append({'tags': tag,
                                   'chrom': chrom,
                                   'pos': newpos,
                                   'ref_base': newref,
                                   'alt_base': newalt,
                                   'sample_id': sample,
                                   'phred': qual,
                                   'filter': vcf_filter,
                                   'zygosity': zyg,
                                   'tot_reads': depth,
                                   'alt_reads': alt_reads,
                                   'af': af,
                                   })
        return all_wdicts

    @staticmethod
    def _called_alleles(gt_str):
        """Return a dict keyed by the integer allele indices called in `gt_str`.

        Missing calls ('.') are ignored. Raises BadFormatError for any other
        non-integer allele.
        """
        gts = {}
        for gt in gt_str.replace('/', '|').split('|'):
            if gt == '.':
                continue
            try:
                gts[int(gt)] = True
            except ValueError:
                raise BadFormatError('Invalid GT value: %s' % gt_str)
        return gts

    @staticmethod
    def _sample_field(sample_data, genotype_fields, key):
        """Return the sample's value for FORMAT key `key`, or None if absent.

        A sample entry may hold fewer values than FORMAT lists keys, so the
        index is checked against the entry's actual length.
        """
        idx = genotype_fields.get(key)
        if idx is None or idx >= len(sample_data):
            return None
        return sample_data[idx]

    # The vcf genotype string has a call for each allele separated by '|' or '/'
    # If the call is the same for all alleles, return 'hom' otherwise 'het'
    def homo_hetro(self, gt_str):
        """Return 'hom', 'het', or '' when any call in `gt_str` is missing."""
        if '.' in gt_str:
            return ''
        gts = gt_str.strip().replace('/', '|').split('|')
        if all(gt == gts[0] for gt in gts):
            return 'hom'
        return 'het'

    # Extract read depth, allele count, and allele frequency from optional VCF information
    def extract_read_info(self, sample_data, gt, gts, genotype_fields):
        """Return (depth, alt_reads, af) for allele `gt` of one sample.

        Arguments:
            sample_data: the sample column split on ':'.
            gt: allele index the values are wanted for.
            gts: dict keyed by all allele indices called for the sample.
            genotype_fields: FORMAT key -> index into `sample_data`.

        Each returned value is '' when it is unavailable or cannot be
        computed (missing field, non-numeric value, zero depth).
        """
        depth = ''
        alt_reads = ''
        ref_reads = ''
        af = ''

        # AD contains 2 values usually ref count and alt count unless there are
        # multiple alts then it will have alt 1 then alt 2.
        # NOTE(review): the VCF specification defines AD as one count per
        # allele including the reference; the index choice below is kept from
        # the original code - confirm it against the files this tool receives.
        ad_raw = self._sample_field(sample_data, genotype_fields, 'AD')
        if ad_raw is not None:
            ad = ad_raw.split(',')
            if 0 in gts:
                # if part of the genotype is reference, then AD will have #ref reads, #alt reads
                if len(ad) > 1:
                    ref_reads = ad[0]
                    alt_reads = ad[1]
            elif gt == max(gts):
                # if genotype has multiple alt bases, then AD will have #alt1 reads, #alt2 reads
                if len(ad) > 1:
                    alt_reads = ad[1]
            else:
                alt_reads = ad[0]

        dp_raw = self._sample_field(sample_data, genotype_fields, 'DP')
        if dp_raw is not None:
            depth = dp_raw
        elif alt_reads != '' and ref_reads != '':
            # if DP is not present but we have alt and ref reads count, dp = ref+alt
            try:
                depth = int(alt_reads) + int(ref_reads)
            except ValueError:
                depth = ''

        af_raw = self._sample_field(sample_data, genotype_fields, 'AF')
        try:
            if af_raw is not None:
                af = float(af_raw)
            elif depth != '' and alt_reads != '':
                # if AF not specified, calc it from alt reads and depth
                af = float(alt_reads) / float(depth)
        except (ValueError, ZeroDivisionError):
            # Missing ('.') or otherwise non-numeric values, or zero depth.
            af = ''

        return depth, alt_reads, af

    def extract_vcf_variant(self, strand, pos, ref, alt):
        """Return (pos, ref, alt) with the bases shared by ref and alt trimmed.

        An emptied ref or alt is reported as '-'. When ref and alt are the
        same single nucleotide the three inputs are returned unchanged (pos
        keeps the type it was passed in with); otherwise pos is an int.
        """
        # Returns without change if same single nucleotide for ref and alt.
        if len(ref) == 1 and len(alt) == 1 and ref == alt:
            return pos, ref, alt

        # Trimming from the start and then the end of the sequence
        # where the sequences overlap with the same nucleotides
        new_ref, new_alt, new_pos = self.trimming_vcf_input(ref, alt, pos, strand)

        if new_ref == '':
            new_ref = '-'
        if new_alt == '':
            new_alt = '-'

        return new_pos, new_ref, new_alt

    # This function looks at the ref and alt sequences and removes
    # where the overlapping sequences contain the same nucleotide.
    # This trims from the end first but does not remove the first nucleotide
    # because based on the format of VCF input the
    # first nucleotide of the ref and alt sequence occur
    # at the position specified.
    #     End removed first, not the first nucleotide
    #     Front removed and position changed
    def trimming_vcf_input(self, ref, alt, pos, strand):
        """Trim shared trailing, then shared leading, bases of ref and alt.

        Returns (trimmed_ref, trimmed_alt, new_pos). The position moves by
        one per trimmed leading base: up for strand '+', down for strand
        '-', and not at all for any other strand value.
        """
        pos = int(pos)

        # Trims from the end. Except don't remove the first nucleotide.
        # 1:6530968 CTCA -> GTCTCA becomes C -> GTC.
        max_suffix = min(len(ref), len(alt)) - 1
        n_suffix = 0
        while n_suffix < max_suffix and ref[-1 - n_suffix] == alt[-1 - n_suffix]:
            n_suffix += 1
        new_ref = ref[:len(ref) - n_suffix]
        new_alt = alt[:len(alt) - n_suffix]

        # Trims from the start. 1:6530968 G -> GT becomes 1:6530969 - -> T.
        max_prefix = min(len(new_ref), len(new_alt))
        n_prefix = 0
        while n_prefix < max_prefix and new_ref[n_prefix] == new_alt[n_prefix]:
            n_prefix += 1

        if strand == '+':
            new_pos = pos + n_prefix
        elif strand == '-':
            new_pos = pos - n_prefix
        else:
            new_pos = pos
        return new_ref[n_prefix:], new_alt[n_prefix:], new_pos


if __name__ == "__main__":
    c = CravatConverter()
--- a/cravat_submit/Z_Additional_Details2018-08-16_15-00-12_.tsv Thu Aug 16 15:06:30 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -#Variant Additional Details Report -#2018-08-16 19:00:07.742990 -#CRAVAT version: hybrid -#Analysis done at http://www.cravat.us. -#Job Id: cravatgalaxy_20180816_145959 -#Input file: converted.txt -#This report shows analysis results at variant level. -#hg38 genomic. -#N/A -#For more information on CRAVAT, visit http://www.cravat.us. - -Input line ID Chromosome Position Strand Reference base(s) Alternate base(s) Sample ID HUGO symbol Sequence ontology S.O. transcript S.O. transcript strand Protein sequence change S.O. all transcripts CGC driver class CGC inheritance CGC tumor types somatic CGC tumor types germline ClinVar disease identifier ClinVar XRef COSMIC transcript COSMIC protein change COSMIC variant count ESP6500 AF (European American) ESP6500 AF (African American) HGVS Genomic HGVS Protein HGVS Protein All NCI pathway hits NCI pathway IDs NCI pathway names VEST score transcript VEST p-value VEST score (missense) VEST score (frameshift indels) VEST score (inframe indels) VEST score (stop-gain) VEST score (stop-loss) VEST score (splice site) All transcripts VEST results -1 TR1 chr22 30025797 + A T sample_1 MTMR3 MS ENST00000401950.6 + N1198I ENST00000323630.9:N1062I(MS),ENST00000351488.7:N1161I(MS),ENST00000333027.7:N1170I(MS),ENST00000406629.1:N1170I(MS),*ENST00000401950.6:N1198I(MS) 0 0.0226963 NC_000022.10:g.30025797A>T ENST00000401950.6:p.Asn1198Ile ENST00000323630.9:p.Asn1062Ile,ENST00000351488.7:p.Asn1161Ile,ENST00000333027.7:p.Asn1170Ile,ENST00000406629.1:p.Asn1170Ile,*ENST00000401950.6:p.Asn1198Ile 0 ENST00000323630.9:N1062I 0.14523 0.569 ENST00000401950.6:N1198I(0.5:0.18672),ENST00000333027.7:N1170I(0.475:0.19897),*ENST00000323630.9:N1062I(0.569:0.14523),ENST00000406629.1:N1170I(0.466:0.20362),ENST00000351488.7:N1161I(0.482:0.19512) -2 TR2 chr22 40418496 - A G sample_1 MKL1 MS 
ENST00000618196.4 - S683G ENST00000396617.7:S648G(MS),ENST00000614754.4:S649G(MS),ENST00000402042.5:S598G(MS),ENST00000620651.4:S599G(MS),ENST00000407029.5:S648G(MS),ENST00000355630.7:S648G(MS),*ENST00000618196.4:S683G(MS) somatic acute megakaryocytic leukaemia ENST00000355630 p.S648G (stomach 1) 1 0.396977 0.860645 NC_000022.10:g.40418496T>C ENST00000618196.4:p.Ser683Gly ENST00000396617.7:p.Ser648Gly,ENST00000614754.4:p.Ser649Gly,ENST00000402042.5:p.Ser598Gly,ENST00000620651.4:p.Ser599Gly,ENST00000407029.5:p.Ser648Gly,ENST00000355630.7:p.Ser648Gly,*ENST00000618196.4:p.Ser683Gly 1 5d9755d6-5521-11e7-8f50-0ac135e8bacf RhoA signaling pathway ENST00000620651.4:S599G 0.77158 0.104 ENST00000396617.7:S648G(0.057:0.92622),ENST00000355630.7:S648G(0.04:0.96174),ENST00000407029.5:S648G(0.043:0.95567),*ENST00000620651.4:S599G(0.104:0.77158),ENST00000618196.4:S683G(0.037:0.9662),ENST00000614754.4:S649G(0.062:0.91296),ENST00000402042.5:S598G(0.093:0.80893) -3 TR3 chr22 40419252 + C T sample_1 MKL1 MS ENST00000618196.4 - A431T ENST00000396617.7:A396T(MS),ENST00000614754.4:A397T(MS),ENST00000402042.5:A346T(MS),ENST00000620651.4:A347T(MS),ENST00000407029.5:A396T(MS),ENST00000355630.7:A396T(MS),*ENST00000618196.4:A431T(MS) somatic acute megakaryocytic leukaemia 0.00127937 0.0834091 NC_000022.10:g.40419252C>T ENST00000618196.4:p.Ala431Thr ENST00000396617.7:p.Ala396Thr,ENST00000614754.4:p.Ala397Thr,ENST00000402042.5:p.Ala346Thr,ENST00000620651.4:p.Ala347Thr,ENST00000407029.5:p.Ala396Thr,ENST00000355630.7:p.Ala396Thr,*ENST00000618196.4:p.Ala431Thr 1 5d9755d6-5521-11e7-8f50-0ac135e8bacf RhoA signaling pathway ENST00000620651.4:A347T 0.80893 0.093 ENST00000396617.7:A396T(0.039:0.96377),ENST00000355630.7:A396T(0.039:0.96377),ENST00000407029.5:A396T(0.033:0.97116),*ENST00000620651.4:A347T(0.093:0.80893),ENST00000618196.4:A431T(0.034:0.97045),ENST00000614754.4:A397T(0.029:0.97723),ENST00000402042.5:A346T(0.042:0.95749) -4 TR4 chr22 24719483 + - AGG sample_2 PIWIL3 II ENST00000332271.9 - 
F880SF ENST00000533313.5:F762SF(II),ENST00000527701.5:F762SF(II),*ENST00000332271.9:F880SF(II),ENST00000616349.4:F880SF(II) 0 0 NC_000022.10:g.24719482_24719483insAGG ENST00000332271.9:p.Leu879_Phe880insSer ENST00000533313.5:p.Leu761_Phe762insSer,ENST00000527701.5:p.Leu761_Phe762insSer,*ENST00000332271.9:p.Leu879_Phe880insSer,ENST00000616349.4:p.Leu879_Phe880insSer 0 ENST00000533313.5:762 0.47384 0.14 *ENST00000533313.5:762(0.14:0.47384),ENST00000332271.9:880(0.12:0.5116),ENST00000527701.5:762(0.14:0.47384),ENST00000616349.4:880(0.11:0.53414)
--- a/cravat_submit/Z_Variant_Result2018-08-16_15-00-12_.tsv Thu Aug 16 15:06:30 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -#Variant Report -#2018-08-16 19:00:07.740432 -#CRAVAT version: hybrid -#Analysis done at http://www.cravat.us. -#Job Id: cravatgalaxy_20180816_145959 -#Input file: converted.txt -#This report shows analysis results at variant level. -#hg38 genomic. -#N/A -#For more information on CRAVAT, visit http://www.cravat.us. - -Input line ID Chromosome Position Strand Reference base(s) Alternate base(s) Sample ID HUGO symbol Sequence ontology Protein sequence change ClinVar COSMIC ID COSMIC variant count (tissue) Number of samples with variant dbSNP ESP6500 AF (average) gnomAD AF Total gnomAD AF African gnomAD AF American gnomAD AF Ashkenazi Jewish gnomAD AF East Asian gnomAD AF Finnish gnomAD AF Non-Finnish European gnomAD AF Other gnomAD AF South Asian GWAS NHLBI Key (GRASP) GWAS PMID (GRASP) GWAS Phenotype (GRASP) Protein 3D variant In TCGA Mutation Cluster ncRNA Class ncRNA Name Pseudogene Pseudogene Transcript Repeat Class Repeat Family Repeat Name TARGET 1000 Genomes AF UTR/Intron UTR/Intron Gene UTR/Intron All Transcript VEST p-value VEST FDR CGL driver class -1 TR1 chr22 30025797 + A T sample_1 MTMR3 MS N1198I 1 rs75623810 0.01134815 0.00229508433352 0.0250541035459 0.000900691498634 1.57942951006e-05 0.00015475085113 0.00778754 0.14523 -2 TR2 chr22 40418496 - A G sample_1 MKL1 MS S683G COSM149301 stomach(1) 1 rs878756 0.628811 0.483227 intron MKL1 ENST00000618417.1(intron) 0.77158 -3 TR3 chr22 40419252 + C T sample_1 MKL1 MS A431T 1 rs34736200 0.042344235 0.00952521054761 0.092516658124 0.00484894792524 0.00298626318933 0.00198817839871 0.000953211839214 0.00409191060749 0.00158217417101 ../MuPIT_Interactive?gm=chr22:40419252 0.0279553 intron MKL1 ENST00000618417.1(intron) 0.80893 -4 TR4 chr22 24719483 + - AGG sample_2 PIWIL3 II F880SF 1 0.0 ../MuPIT_Interactive?gm=chr22:24719483 0 0.47384
--- a/cravat_submit/cravat_submit.py Thu Aug 16 15:06:30 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,175 +0,0 @@ -from __future__ import print_function -import requests -import json -import time -try: - # Python 3.0+ - from urllib.request import urlretrieve -except: - # Python 2.7 - from urllib import urlretrieve -import sys -import csv -import argparse - -""" -Tool's email: -usernmae: cravatgalaxy@gmail.com -password: chicken_quesadilla -""" - -email = 'cravatgalaxy@gmail.com' - -class CravatSubmissionException(Exception): - def __init__(self, message): - super(CravatSubmissionException, self).__init__(message) - -class CravatSubmission(object): - - def get_cmd_args(self, argv): - parser = argparse.ArgumentParser() - parser.add_argument('path', - help="Path to python module") - parser.add_argument('--input', - '-i', - required = True, - help='Input path to a cravat file for querying',) - parser.add_argument('--output', - '-o', - default = None, - help = 'Output path to write results from query') - parser.add_argument('--analysis', - '-a', - required=True, - help = "Cravat analysis. Should be 'VEST', 'CHASM', 'NONE', or 'VEST;CHASM'") - return parser.parse_args(argv) - - def is_valid_analysis(self, analysis): - """: Test if analysis is a recognized value""" - analyses = ["VEST", "CHASM", "VEST;CHASM", ""] - return analysis in analyses - - def is_skippable(self, s): - """: Test if a line (str or list/tuple) is skippable, a.k.a. 
a header or blank line""" - if not isinstance(s, str): - raise CravatSubmissionException("is_skippable accepts a string") - skippable = s == "" \ - or s[0] == "#" \ - or s.startswith('"#For more information on CRAVAT') \ - or s.isspace() - return skippable - - def parse(self, s, sep='\t'): - """: Convert string line to an array of values""" - return s.strip().split(sep) - - def unparse(self, array, sep='\t', newline='\n'): - """: Convert an array of values to a writable string line""" - return sep.join([str(i) for i in array]) + newline - - def get_headers(self, path, pattern='Input line', sep='\t'): - """: Get the headers from a Results/Details file obtained from by a finished Cravat submission""" - with open(path, 'r') as f: - for line in f: - if line.startswith(pattern): - return self.parse(line) - return None - - def create_index(self, path, prop='Input line'): - """ - : Create an index of seek/tell positions in file associated to a line value. Used to record - : the location of lines betwen two files that are associated with each other without reading entire - : files into memory. - """ - headers = self.get_headers(path) - if prop not in headers: - raise CravatSubmissionException("Index retrievel property not found in headers") - prop_loc = headers.index(prop) - index = {} - with open(path, 'r') as f: - pos = 0 - line = f.readline() - while line != "": - if not self.is_skippable(line): - parsed = self.parse(line) - if not parsed == headers: - index[parsed[prop_loc]] = pos - pos = f.tell() - line = f.readline() - return index - - def get_header_val_dict(self, headers, vals): - """: Associate an array of header keys to an array of values.""" - return { header:val for (header, val) in zip(headers, vals) } - - def write_results(self, results_path, details_path, out_path, write_headers=True): - """ - : Using the paths to the Results and Details file from a Cravat Sumbission, - : write the output file. 
- """ - results_headers = self.get_headers(results_path) - details_headers = self.get_headers(details_path) - if results_headers == None \ - or details_headers == None: - raise CravatSubmissionException("Unable to intepret headers in Results or Details submission files") - headers = results_headers - headers.extend(filter(lambda x: x not in headers, details_headers)) - results_index = self.create_index(results_path) - details_index = self.create_index(details_path) - with open(results_path, 'r') as results_file, \ - open(details_path, 'r') as details_file, \ - open(out_path, 'w') as out_file: - if write_headers: - out_file.write(self.unparse(headers)) - for line_id, file_pos in results_index.items(): - results_file.seek(file_pos) - results_vals = self.parse(results_file.readline()) - results_dict = self.get_header_val_dict(results_headers, results_vals) - if line_id in details_index: - details_file.seek(details_index[line_id]) - details_vals = self.parse(details_file.readline()) - details_dict = self.get_header_val_dict(details_headers, details_vals) - # On a repeated entry, the Details value will overwrite Results value - results_dict.update(details_dict) - line = [ results_dict.get(header, 'None') for header in headers ] - out_file.write(self.unparse(line)) - - def submit(self, in_path, analysis): - """: Make a POST request to submit a job to production CRAVAT server.""" - if not self.is_valid_analysis(analysis): - raise ValueError("Did not get valid analyses.") - # Create post request to submit job to CRAVAT production server - submit = requests.post('http://cravat.us/CRAVAT/rest/service/submit', - files={'inputfile' : open(in_path)}, - data={'email' : email, - 'analyses' : analysis}) - # Check job run status in loop until status is 'Success' - jobid = json.loads(submit.text)['jobid'] - while True: - check = requests.get('http://cravat.us/CRAVAT/rest/service/status', params={'jobid': jobid}) - status = json.loads(check.text)['status'] - print(status) - if status 
== 'Success': - break - else: - time.sleep(2) - # Download completed job results to local files - timestamp = time.strftime("%Y-%m-%d_%H-%M-%S_") - results_path = 'Z_Variant_Result' + timestamp + '.tsv' - details_path = 'Z_Additional_Details' + timestamp + '.tsv' - urlretrieve("http://cravat.us/CRAVAT/results/" + jobid + "/" + "Variant.Result.tsv", - filename=results_path) - urlretrieve("http://cravat.us/CRAVAT/results/" + jobid + "/" + "Variant_Additional_Details.Result.tsv", - filename=details_path) - return results_path, details_path - -if __name__ == "__main__": - submission = CravatSubmission() - cmd_args = submission.get_cmd_args(sys.argv) - # Galaxy converts semi-colons to X's. Switch it back - analysis = cmd_args.analysis - if analysis == "VESTXCHASM": - analysis = "VEST;CHASM" - results_path, details_path = submission.submit(cmd_args.input, analysis) - #submission.write_results('Results_test.tsv', 'Details_test.tsv', 'Out_test.tsv') - submission.write_results(results_path, details_path, cmd_args.output) \ No newline at end of file
--- a/cravat_submit/cravat_submit.xml Thu Aug 16 15:06:30 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -<tool id="cravat_submit" name="CRAVAT Submit, Check, and Retrieve" version="0.1.0"> - <description>Submits, checks for, and retrieves data for cancer annotation</description> - <command interpreter="python">cravat_submit.py --input $input --output $output --analysis $dropdown</command> - - - <inputs> - - <param format="tabular" name="input" type="data" label="Source file"> </param> - <param format="tabular" name="dropdown" type="select" label="Analysis Program"> - <option value="None">None</option> - <option value="VEST">VEST</option> - <option value="CHASM">CHASM</option> - <option value="VEST;CHASM">VEST and CHASM</option> - </param> - - - </inputs> - - <outputs> - <data format="tabular" name="output" /> - </outputs> - - <tests> - <test> - <param name="input" value="fa_gc_content_input.fa"/> - <output name="out_file1" file="fa_gc_content_output.txt"/> - </test> - </tests> - - <help> - This tool submits, checks for, and retrieves data for cancer annotation. - </help> - -</tool>