changeset 19:275d45d14350 draft

Uploaded
author in_silico
date Thu, 16 Aug 2018 15:10:07 -0400
parents 25f77adaf9d9
children 6da9a12f04b5
files cravat_convert/__pycache__/base_converter.cpython-36.pyc cravat_convert/__pycache__/vcf_converter.cpython-36.pyc cravat_convert/base_converter.py cravat_convert/cravat_convert.py cravat_convert/cravat_convert.xml cravat_convert/vcf_converter.py cravat_submit/Z_Additional_Details2018-08-16_15-00-12_.tsv cravat_submit/Z_Variant_Result2018-08-16_15-00-12_.tsv cravat_submit/cravat_submit.py cravat_submit/cravat_submit.xml
diffstat 10 files changed, 241 insertions(+), 365 deletions(-) [+]
line wrap: on
line diff
Binary file cravat_convert/__pycache__/base_converter.cpython-36.pyc has changed
Binary file cravat_convert/__pycache__/vcf_converter.cpython-36.pyc has changed
--- a/cravat_convert/base_converter.py	Thu Aug 16 15:09:58 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,22 +0,0 @@
-class BaseConverter(object):
-    def __init__(self):
-        self.format_name = None
-    def check_format(self,*args,**kwargs):
-        err_msg = 'Converter for %s format has no method check_format' %\
-            self.format_name
-        raise NotImplementedError(err_msg)
-    def setup(self,*args,**kwargs):
-        err_msg = 'Converter for %s format has no method setup' %\
-            self.format_name
-        raise NotImplementedError(err_msg)
-    def convert_line(self,*args,**kwargs):
-        err_msg = 'Converter for %s format has no method convert_line' %\
-            self.format_name
-        raise NotImplementedError(err_msg)
-
-
-class BadFormatError(Exception):
-    def __init__(self, message, errors=None):
-        super(BadFormatError, self).__init__(message)
-        # Support for custom error codes, if added later
-        self.errors = errors
\ No newline at end of file
--- a/cravat_convert/cravat_convert.py	Thu Aug 16 15:09:58 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,80 +0,0 @@
-from __future__ import print_function
-import os
-import argparse
-from vcf_converter import CravatConverter
-
-def get_vcf_mapping():
-    """ : VCF Headers mapped to their index position in a row of VCF values.
-        : These are only the mandatory columns, per the VCF spec.
-    """
-    return {
-        'CHROM': 0,
-        'POS': 1,
-        'ID': 2,
-        'REF': 3,
-        'ALT': 4,
-        'QUAL': 5,
-        'FILTER': 6,
-        'INFO': 7
-    }
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input',
-                            '-i',
-                            required = True,
-                            help='Input path to a VCF file for conversion',)
-    parser.add_argument('--output',
-                            '-o',
-                            default = None,
-                            help = 'Output path to write the cravat file to')
-    return parser.parse_args()
-
-
-def convert(in_path, out_path=None, cr_sep='\t', cr_newline='\n'):
-    """ : Convert a VCF file to a Cravat file.
-        : Arguments:
-            : in_path: <str> path to input vcf file
-            : out_path: <str> path to output cravat file. Will defualt to cravat_converted.txt in the input directory.
-            : cr_sep: <str> the value delimiter for the output cravat file. Default value of '\\t'.
-            : out_newline: <str> the newline delimiter in the output cravat file. Default of '\\n'
-    """
-    if not out_path:
-        base, _ = os.path.split(in_path)
-        out_path = os.path.join(base, "cravat_converted.txt")
-    
-    with open(in_path, 'r') as in_file, \
-    open(out_path, 'w') as out_file:
-
-        # cr_count will be used to generate the 'TR' field of the cravat rows (first header)
-        cr_count = 0
-        # VCF lines are always assumed to be '+' strand, as VCF doesn't specify that attribute
-        strand = '+'
-        # VCF converter. Adjusts position, reference, and alternate for Cravat formatting.
-        converter = CravatConverter()
-        # A dictionary of mandatory vcf headers mapped to their row indices
-        vcf_mapping = get_vcf_mapping()
-
-        for line in in_file:
-            if line.startswith("#"):
-                continue
-            line = line.strip().split()
-            # row is dict of VCF headers mapped to corresponding values of this line
-            row = { header: line[index] for header, index in vcf_mapping.items() }
-            for alt in row["ALT"].split(","):
-                new_pos, new_ref, new_alt = converter.extract_vcf_variant(strand, row["POS"], row["REF"], alt)
-                new_pos, new_ref, new_alt = str(new_pos), str(new_ref), str(new_alt)
-                cr_line = cr_sep.join([
-                    'TR' + str(cr_count), row['CHROM'], new_pos, strand, new_ref, new_alt, row['ID']
-                ])
-                out_file.write(cr_line + cr_newline)
-                cr_count += 1
-
-
-if __name__ == "__main__":
-    cli_args = get_args()
-    if cli_args.output == None:
-        base, _ = os.path.split(cli_args.input)
-        cli_args.output = os.path.join(base, "cravat_converted.txt") 
-    convert(in_path = cli_args.input, out_path = cli_args.output)
--- a/cravat_convert/cravat_convert.xml	Thu Aug 16 15:09:58 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,20 +0,0 @@
-<tool id="cravat_convert" name="CRAVAT Convert" version="1.0.0">
-    <description>Converts a VCF format file to a Cravat format file</description>
-    <command interpreter="python">cravat_convert.py -i $input -o $output</command>
-  
-    <inputs>
-        <param format="tabular" name="input" type="data" label="Source file"/>
-    </inputs>
-  
-    <outputs>
-        <data format="tabular" name="output" />
-    </outputs>
-
-    <!-- <tests></tests> -->
-
-    <help>
-        Converts a VCF format file to a Cravat format file
-    </help>
-
-</tool>
-
--- a/cravat_convert/vcf_converter.py	Thu Aug 16 15:09:58 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,243 +0,0 @@
-"""
-A module originally obtained from the cravat package. Modified to use in the vcf
-converter galaxy tool.
-
-
-Register of changes made (Chris Jacoby):
-    1) Changed imports as galaxy tool won't have access to complete cravat python package
-    2) Defined BadFormatError in BaseConverted file, as I didn't have the BadFormatError module
-"""
-
-from base_converter import BaseConverter, BadFormatError
-import re
-
-class CravatConverter(BaseConverter):
-    
-    def __init__(self):
-        self.format_name = 'vcf'
-        self.samples = []
-        self.var_counter = 0
-        self.addl_cols = [{'name':'phred',
-                           'title':'Phred',
-                           'type':'string'},
-                          {'name':'filter',
-                           'title':'VCF filter',
-                           'type':'string'},
-                          {'name':'zygosity',
-                           'title':'Zygosity',
-                           'type':'string'},
-                          {'name':'alt_reads',
-                           'title':'Alternate reads',
-                           'type':'int'},
-                          {'name':'tot_reads',
-                           'title':'Total reads',
-                           'type':'int'},
-                          {'name':'af',
-                           'title':'Variant allele frequency',
-                           'type':'float'}]
-    
-    def check_format(self, f): 
-        return f.readline().startswith('##fileformat=VCF')
-    
-    def setup(self, f):
-        
-        vcf_line_no = 0
-        for line in f:
-            vcf_line_no += 1
-            if len(line) < 6:
-                continue
-            if line[:6] == '#CHROM':
-                toks = re.split('\s+', line.rstrip())
-                if len(toks) > 8:
-                    self.samples = toks[9:]
-                break
-    
-    def convert_line(self, l):
-        if l.startswith('#'): return None
-        self.var_counter += 1
-        toks = l.strip('\r\n').split('\t')
-        all_wdicts = []
-        if len(toks) < 8:
-            raise BadFormatError('Wrong VCF format')
-        [chrom, pos, tag, ref, alts, qual, filter, info] = toks[:8]
-        if tag == '':
-            raise BadFormatError('ID column is blank')
-        elif tag == '.':
-            tag = 'VAR' + str(self.var_counter)
-        if chrom[:3] != 'chr':
-            chrom = 'chr' + chrom
-        alts = alts.split(',')
-        len_alts = len(alts)
-        if len(toks) == 8:
-            for altno in range(len_alts):
-                wdict = None
-                alt = alts[altno]
-                newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
-                wdict = {'tags':tag,
-                         'chrom':chrom,
-                         'pos':newpos,
-                         'ref_base':newref,
-                         'alt_base':newalt,
-                         'sample_id':'no_sample',
-                         'phred': qual,
-                         'filter': filter}
-                all_wdicts.append(wdict)
-        elif len(toks) > 8:
-            sample_datas = toks[9:]
-            genotype_fields = {}
-            genotype_field_no = 0
-            for genotype_field in toks[8].split(':'):
-                genotype_fields[genotype_field] = genotype_field_no
-                genotype_field_no += 1
-            if not ('GT' in genotype_fields):
-                raise BadFormatError('No GT Field')
-            gt_field_no = genotype_fields['GT']
-            for sample_no in range(len(sample_datas)):
-                sample = self.samples[sample_no]
-                sample_data = sample_datas[sample_no].split(':')
-                gts = {}
-                for gt in sample_data[gt_field_no].replace('/', '|').split('|'):
-                    if gt == '.':
-                        continue
-                    else:
-                        gts[int(gt)] = True
-                for gt in sorted(gts.keys()):
-                    wdict = None
-                    if gt == 0:
-                        continue
-                    else:
-                        alt = alts[gt - 1]
-                        newpos, newref, newalt = self.extract_vcf_variant('+', pos, ref, alt)
-                        zyg = self.homo_hetro(sample_data[gt_field_no])
-                        depth, alt_reads, af = self.extract_read_info(sample_data, gt, gts, genotype_fields)
-                            
-                        wdict = {'tags':tag,
-                                 'chrom':chrom,
-                                 'pos':newpos,
-                                 'ref_base':newref,
-                                 'alt_base':newalt,
-                                 'sample_id':sample,
-                                 'phred': qual,
-                                 'filter': filter,
-                                 'zygosity': zyg,
-                                 'tot_reads': depth,
-                                 'alt_reads': alt_reads,
-                                 'af': af,                                
-                                 }
-                        all_wdicts.append(wdict)
-        return all_wdicts
- 
-    #The vcf genotype string has a call for each allele separated by '\' or '/'
-    #If the call is the same for all allels, return 'hom' otherwise 'het'
-    def homo_hetro(self, gt_str):
-        if '.' in gt_str:
-            return '';
-        
-        gts = gt_str.strip().replace('/', '|').split('|')
-        for gt in gts:
-            if gt != gts[0]:
-                return 'het'
-        return 'hom'            
-                        
-    #Extract read depth, allele count, and allele frequency from optional VCR information
-    def extract_read_info (self, sample_data, gt, gts, genotype_fields): 
-        depth = ''
-        alt_reads = ''
-        ref_reads = ''
-        af = ''
-        
-        #AD contains 2 values usually ref count and alt count unless there are 
-        #multiple alts then it will have alt 1 then alt 2.
-        if 'AD' in genotype_fields and genotype_fields['AD'] <= len(sample_data): 
-            if 0 in gts.keys():
-                #if part of the genotype is reference, then AD will have #ref reads, #alt reads
-                ref_reads = sample_data[genotype_fields['AD']].split(',')[0]
-                alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
-            elif gt == max(gts.keys()):    
-                #if geontype has multiple alt bases, then AD will have #alt1 reads, #alt2 reads
-                alt_reads = sample_data[genotype_fields['AD']].split(',')[1]
-            else:
-                alt_reads = sample_data[genotype_fields['AD']].split(',')[0]                            
-                             
-        if 'DP' in genotype_fields and genotype_fields['DP'] <= len(sample_data): 
-            depth = sample_data[genotype_fields['DP']] 
-        elif alt_reads != '' and ref_reads != '':
-            #if DP is not present but we have alt and ref reads count, dp = ref+alt
-            depth = int(alt_reads) + int(ref_reads)   
-
-        if 'AF' in genotype_fields and genotype_fields['AF'] <= len(sample_data):
-            af = float(sample_data[genotype_fields['AF']] )
-        elif depth != '' and alt_reads != '':
-            #if AF not specified, calc it from alt and ref reads
-            af = float(alt_reads) / float(depth)
- 
-        return depth, alt_reads, af
-            
-    def extract_vcf_variant (self, strand, pos, ref, alt):
-
-        reflen = len(ref)
-        altlen = len(alt)
-        
-        # Returns without change if same single nucleotide for ref and alt. 
-        if reflen == 1 and altlen == 1 and ref == alt:
-            return pos, ref, alt
-        
-        # Trimming from the start and then the end of the sequence 
-        # where the sequences overlap with the same nucleotides
-        new_ref2, new_alt2, new_pos = \
-            self.trimming_vcf_input(ref, alt, pos, strand)
-                
-        if new_ref2 == '':
-            new_ref2 = '-'
-        if new_alt2 == '':
-            new_alt2 = '-'
-        
-        return new_pos, new_ref2, new_alt2
-    
-    # This function looks at the ref and alt sequences and removes 
-    # where the overlapping sequences contain the same nucleotide.
-    # This trims from the end first but does not remove the first nucleotide 
-    # because based on the format of VCF input the 
-    # first nucleotide of the ref and alt sequence occur 
-    # at the position specified.
-    #     End removed first, not the first nucleotide
-    #     Front removed and position changed
-    def trimming_vcf_input(self, ref, alt, pos, strand):
-        pos = int(pos)
-        reflen = len(ref)
-        altlen = len(alt)
-        minlen = min(reflen, altlen)
-        new_ref = ref
-        new_alt = alt
-        new_pos = pos
-        # Trims from the end. Except don't remove the first nucleotide. 
-        # 1:6530968 CTCA -> GTCTCA becomes C -> GTC.
-        for nt_pos in range(0, minlen - 1): 
-            if ref[reflen - nt_pos - 1] == alt[altlen - nt_pos - 1]:
-                new_ref = ref[:reflen - nt_pos - 1]
-                new_alt = alt[:altlen - nt_pos - 1]
-            else:
-                break    
-        new_ref_len = len(new_ref)
-        new_alt_len = len(new_alt)
-        minlen = min(new_ref_len, new_alt_len)
-        new_ref2 = new_ref
-        new_alt2 = new_alt
-        # Trims from the start. 1:6530968 G -> GT becomes 1:6530969 - -> T.
-        for nt_pos in range(0, minlen):
-            if new_ref[nt_pos] == new_alt[nt_pos]:
-                if strand == '+':
-                    new_pos += 1
-                elif strand == '-':
-                    new_pos -= 1
-                new_ref2 = new_ref[nt_pos + 1:]
-                new_alt2 = new_alt[nt_pos + 1:]
-            else:
-                new_ref2 = new_ref[nt_pos:]
-                new_alt2 = new_alt[nt_pos:]
-                break  
-        return new_ref2, new_alt2, new_pos
-
-
-if __name__ == "__main__":
-    c = CravatConverter()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat_submit/Z_Additional_Details2018-08-16_15-00-12_.tsv	Thu Aug 16 15:10:07 2018 -0400
@@ -0,0 +1,16 @@
+#Variant Additional Details Report
+#2018-08-16 19:00:07.742990
+#CRAVAT version: hybrid
+#Analysis done at http://www.cravat.us.
+#Job Id: cravatgalaxy_20180816_145959
+#Input file: converted.txt
+#This report shows analysis results at variant level.
+#hg38 genomic.
+#N/A
+#For more information on CRAVAT, visit http://www.cravat.us.
+
+Input line	ID	Chromosome	Position	Strand	Reference base(s)	Alternate base(s)	Sample ID	HUGO symbol	Sequence ontology	S.O. transcript	S.O. transcript strand	Protein sequence change	S.O. all transcripts	CGC driver class	CGC inheritance	CGC tumor types somatic	CGC tumor types germline	ClinVar disease identifier	ClinVar XRef	COSMIC transcript	COSMIC protein change	COSMIC variant count	ESP6500 AF (European American)	ESP6500 AF (African American)	HGVS Genomic	HGVS Protein	HGVS Protein All	NCI pathway hits	NCI pathway IDs	NCI pathway names	VEST score transcript	VEST p-value	VEST score (missense)	VEST score (frameshift indels)	VEST score (inframe indels)	VEST score (stop-gain)	VEST score (stop-loss)	VEST score (splice site)	All transcripts VEST results
+1	TR1	chr22	30025797	+	A	T	sample_1	MTMR3	MS	ENST00000401950.6	+	N1198I	ENST00000323630.9:N1062I(MS),ENST00000351488.7:N1161I(MS),ENST00000333027.7:N1170I(MS),ENST00000406629.1:N1170I(MS),*ENST00000401950.6:N1198I(MS)										0	0.0226963	NC_000022.10:g.30025797A>T	ENST00000401950.6:p.Asn1198Ile	ENST00000323630.9:p.Asn1062Ile,ENST00000351488.7:p.Asn1161Ile,ENST00000333027.7:p.Asn1170Ile,ENST00000406629.1:p.Asn1170Ile,*ENST00000401950.6:p.Asn1198Ile	0			ENST00000323630.9:N1062I	0.14523	0.569						ENST00000401950.6:N1198I(0.5:0.18672),ENST00000333027.7:N1170I(0.475:0.19897),*ENST00000323630.9:N1062I(0.569:0.14523),ENST00000406629.1:N1170I(0.466:0.20362),ENST00000351488.7:N1161I(0.482:0.19512)
+2	TR2	chr22	40418496	-	A	G	sample_1	MKL1	MS	ENST00000618196.4	-	S683G	ENST00000396617.7:S648G(MS),ENST00000614754.4:S649G(MS),ENST00000402042.5:S598G(MS),ENST00000620651.4:S599G(MS),ENST00000407029.5:S648G(MS),ENST00000355630.7:S648G(MS),*ENST00000618196.4:S683G(MS)		somatic	acute megakaryocytic leukaemia				ENST00000355630	p.S648G (stomach 1)	1	0.396977	0.860645	NC_000022.10:g.40418496T>C	ENST00000618196.4:p.Ser683Gly	ENST00000396617.7:p.Ser648Gly,ENST00000614754.4:p.Ser649Gly,ENST00000402042.5:p.Ser598Gly,ENST00000620651.4:p.Ser599Gly,ENST00000407029.5:p.Ser648Gly,ENST00000355630.7:p.Ser648Gly,*ENST00000618196.4:p.Ser683Gly	1	5d9755d6-5521-11e7-8f50-0ac135e8bacf	RhoA signaling pathway	ENST00000620651.4:S599G	0.77158	0.104						ENST00000396617.7:S648G(0.057:0.92622),ENST00000355630.7:S648G(0.04:0.96174),ENST00000407029.5:S648G(0.043:0.95567),*ENST00000620651.4:S599G(0.104:0.77158),ENST00000618196.4:S683G(0.037:0.9662),ENST00000614754.4:S649G(0.062:0.91296),ENST00000402042.5:S598G(0.093:0.80893)
+3	TR3	chr22	40419252	+	C	T	sample_1	MKL1	MS	ENST00000618196.4	-	A431T	ENST00000396617.7:A396T(MS),ENST00000614754.4:A397T(MS),ENST00000402042.5:A346T(MS),ENST00000620651.4:A347T(MS),ENST00000407029.5:A396T(MS),ENST00000355630.7:A396T(MS),*ENST00000618196.4:A431T(MS)		somatic	acute megakaryocytic leukaemia							0.00127937	0.0834091	NC_000022.10:g.40419252C>T	ENST00000618196.4:p.Ala431Thr	ENST00000396617.7:p.Ala396Thr,ENST00000614754.4:p.Ala397Thr,ENST00000402042.5:p.Ala346Thr,ENST00000620651.4:p.Ala347Thr,ENST00000407029.5:p.Ala396Thr,ENST00000355630.7:p.Ala396Thr,*ENST00000618196.4:p.Ala431Thr	1	5d9755d6-5521-11e7-8f50-0ac135e8bacf	RhoA signaling pathway	ENST00000620651.4:A347T	0.80893	0.093						ENST00000396617.7:A396T(0.039:0.96377),ENST00000355630.7:A396T(0.039:0.96377),ENST00000407029.5:A396T(0.033:0.97116),*ENST00000620651.4:A347T(0.093:0.80893),ENST00000618196.4:A431T(0.034:0.97045),ENST00000614754.4:A397T(0.029:0.97723),ENST00000402042.5:A346T(0.042:0.95749)
+4	TR4	chr22	24719483	+	-	AGG	sample_2	PIWIL3	II	ENST00000332271.9	-	F880SF	ENST00000533313.5:F762SF(II),ENST00000527701.5:F762SF(II),*ENST00000332271.9:F880SF(II),ENST00000616349.4:F880SF(II)										0	0	NC_000022.10:g.24719482_24719483insAGG	ENST00000332271.9:p.Leu879_Phe880insSer	ENST00000533313.5:p.Leu761_Phe762insSer,ENST00000527701.5:p.Leu761_Phe762insSer,*ENST00000332271.9:p.Leu879_Phe880insSer,ENST00000616349.4:p.Leu879_Phe880insSer	0			ENST00000533313.5:762	0.47384			0.14				*ENST00000533313.5:762(0.14:0.47384),ENST00000332271.9:880(0.12:0.5116),ENST00000527701.5:762(0.14:0.47384),ENST00000616349.4:880(0.11:0.53414)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat_submit/Z_Variant_Result2018-08-16_15-00-12_.tsv	Thu Aug 16 15:10:07 2018 -0400
@@ -0,0 +1,16 @@
+#Variant Report
+#2018-08-16 19:00:07.740432
+#CRAVAT version: hybrid
+#Analysis done at http://www.cravat.us.
+#Job Id: cravatgalaxy_20180816_145959
+#Input file: converted.txt
+#This report shows analysis results at variant level.
+#hg38 genomic.
+#N/A
+#For more information on CRAVAT, visit http://www.cravat.us.
+
+Input line	ID	Chromosome	Position	Strand	Reference base(s)	Alternate base(s)	Sample ID	HUGO symbol	Sequence ontology	Protein sequence change	ClinVar	COSMIC ID	COSMIC variant count (tissue)	Number of samples with variant	dbSNP	ESP6500 AF (average)	gnomAD AF Total	gnomAD AF African	gnomAD AF American	gnomAD AF Ashkenazi Jewish	gnomAD AF East Asian	gnomAD AF Finnish	gnomAD AF Non-Finnish European	gnomAD AF Other	gnomAD AF South Asian	GWAS NHLBI Key (GRASP)	GWAS PMID (GRASP)	GWAS Phenotype (GRASP)	Protein 3D variant	In TCGA Mutation Cluster	ncRNA Class	ncRNA Name	Pseudogene	Pseudogene Transcript	Repeat Class	Repeat Family	Repeat Name	TARGET	1000 Genomes AF	UTR/Intron	UTR/Intron Gene	UTR/Intron All Transcript	VEST p-value	VEST FDR	CGL driver class
+1	TR1	chr22	30025797	+	A	T	sample_1	MTMR3	MS	N1198I				1	rs75623810	0.01134815	0.00229508433352	0.0250541035459	0.000900691498634				1.57942951006e-05	0.00015475085113															0.00778754				0.14523		
+2	TR2	chr22	40418496	-	A	G	sample_1	MKL1	MS	S683G		COSM149301	stomach(1)	1	rs878756	0.628811																							0.483227	intron	MKL1	ENST00000618417.1(intron)	0.77158		
+3	TR3	chr22	40419252	+	C	T	sample_1	MKL1	MS	A431T				1	rs34736200	0.042344235	0.00952521054761	0.092516658124	0.00484894792524	0.00298626318933	0.00198817839871		0.000953211839214	0.00409191060749	0.00158217417101				../MuPIT_Interactive?gm=chr22:40419252										0.0279553	intron	MKL1	ENST00000618417.1(intron)	0.80893		
+4	TR4	chr22	24719483	+	-	AGG	sample_2	PIWIL3	II	F880SF				1		0.0													../MuPIT_Interactive?gm=chr22:24719483										0				0.47384		
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat_submit/cravat_submit.py	Thu Aug 16 15:10:07 2018 -0400
@@ -0,0 +1,234 @@
+from __future__ import print_function
+import requests
+import json
+import time
+try:
+    # Python 3.0+
+    from urllib.request import urlretrieve
+except ImportError:
+    # Python 2.7
+    from urllib import urlretrieve
+import sys
+import csv
+import argparse
+
+# NOTE(review): account credentials must never be committed to source control.
+# The plaintext password previously documented here was removed; rotate it and
+# keep it in a secret store outside the repository.
+
+# Contact address sent to the CRAVAT server with every submission.
+email = 'cravatgalaxy@gmail.com'
+
+# CRAVAT REST endpoints used by CravatSubmission.submit.
+SUBMIT_URL = 'http://cravat.us/CRAVAT/rest/service/submit'
+STATUS_URL = 'http://cravat.us/CRAVAT/rest/service/status'
+RESULTS_URL = 'http://cravat.us/CRAVAT/results/'
+
+class CravatSubmissionException(Exception):
+    """: Raised when a submission or its downloaded report files cannot be processed"""
+    def __init__(self, message):
+        super(CravatSubmissionException, self).__init__(message)
+
+class CravatSubmission(object):
+    """: Submits a file to the CRAVAT server and merges the two reports it returns"""
+
+    def get_cmd_args(self, argv):
+        """
+        : Parse command line arguments.
+        : argv is the full sys.argv list; its first element (the script path) is
+        : consumed by the 'path' positional argument.
+        """
+        parser = argparse.ArgumentParser()
+        parser.add_argument('path',
+                                help="Path to python module")
+        parser.add_argument('--input',
+                                '-i',
+                                required = True,
+                                help='Input path to a cravat file for querying',)
+        parser.add_argument('--output',
+                                '-o',
+                                default = None,
+                                help = 'Output path to write results from query')
+        parser.add_argument('--analysis',
+                                '-a',
+                                required=True,
+                                help = "Cravat analysis. Should be 'VEST', 'CHASM', 'NONE', or 'VEST;CHASM'")
+        return parser.parse_args(argv)
+
+    def normalize_analysis(self, analysis):
+        """
+        : Map the 'None'/'NONE' placeholder (offered by the tool form and the CLI help)
+        : to the empty string. Every other value is returned unchanged.
+        """
+        if isinstance(analysis, str) and analysis.upper() == "NONE":
+            return ""
+        return analysis
+
+    def is_valid_analysis(self, analysis):
+        """: Test if analysis is a recognized value ('None'/'NONE' counts as no analysis)"""
+        analyses = ["VEST", "CHASM", "VEST;CHASM", ""]
+        return self.normalize_analysis(analysis) in analyses
+
+    def is_skippable(self, s):
+        """
+        : Test if a line is skippable, a.k.a. a '#' comment line or a blank line.
+        : Raises CravatSubmissionException if s is not a string.
+        """
+        if not isinstance(s, str):
+            raise CravatSubmissionException("is_skippable accepts a string")
+        skippable = s == "" \
+            or s[0] == "#" \
+            or s.startswith('"#For more information on CRAVAT') \
+            or s.isspace()
+        return skippable
+
+    def parse(self, s, sep='\t'):
+        """
+        : Convert string line to an array of values.
+        : Only the line terminator is removed: stripping all whitespace would also
+        : drop empty leading/trailing columns and misalign values with their headers.
+        """
+        return s.rstrip('\r\n').split(sep)
+
+    def unparse(self, array, sep='\t', newline='\n'):
+        """: Convert an array of values to a writable string line"""
+        return sep.join([str(i) for i in array]) + newline
+
+    def get_headers(self, path, pattern='Input line', sep='\t'):
+        """
+        : Get the headers from a Results/Details file obtained from a finished Cravat submission.
+        : Returns None when no line starts with pattern.
+        """
+        with open(path, 'r') as f:
+            for line in f:
+                if line.startswith(pattern):
+                    return self.parse(line, sep)
+        return None
+
+    def create_index(self, path, prop='Input line'):
+        """
+        : Create an index of seek/tell positions in file associated to a line value. Used to record
+        : the location of lines between two files that are associated with each other without reading
+        : entire files into memory.
+        : Raises CravatSubmissionException if the header line or prop cannot be found.
+        """
+        headers = self.get_headers(path)
+        if headers is None or prop not in headers:
+            raise CravatSubmissionException("Index retrievel property not found in headers")
+        prop_loc = headers.index(prop)
+        index = {}
+        with open(path, 'r') as f:
+            pos = 0
+            # readline()/tell() instead of 'for line in f': tell() cannot be relied on
+            # while a text file is being iterated.
+            line = f.readline()
+            while line != "":
+                if not self.is_skippable(line):
+                    parsed = self.parse(line)
+                    if not parsed == headers:
+                        index[parsed[prop_loc]] = pos
+                pos = f.tell()
+                line = f.readline()
+        return index
+
+    def get_header_val_dict(self, headers, vals):
+        """: Associate an array of header keys to an array of values."""
+        return { header:val for (header, val) in zip(headers, vals) }
+
+    def write_results(self, results_path, details_path, out_path, write_headers=True):
+        """
+        : Using the paths to the Results and Details file from a Cravat Submission,
+        : write the output file. Rows are matched on their 'Input line' value and written
+        : in the order they appear in the Results file; columns without a value get 'None'.
+        """
+        results_headers = self.get_headers(results_path)
+        details_headers = self.get_headers(details_path)
+        if results_headers is None \
+        or details_headers is None:
+            raise CravatSubmissionException("Unable to intepret headers in Results or Details submission files")
+        # Copy first: extending an alias would also grow results_headers
+        headers = list(results_headers)
+        for header in details_headers:
+            if header not in headers:
+                headers.append(header)
+        results_index = self.create_index(results_path)
+        details_index = self.create_index(details_path)
+        with open(results_path, 'r') as results_file, \
+        open(details_path, 'r') as details_file, \
+        open(out_path, 'w') as out_file:
+            if write_headers:
+                out_file.write(self.unparse(headers))
+            # Sort by file position so the row order does not depend on dict ordering
+            ordered = sorted(results_index.items(), key=lambda item: item[1])
+            for line_id, file_pos in ordered:
+                results_file.seek(file_pos)
+                results_vals = self.parse(results_file.readline())
+                results_dict = self.get_header_val_dict(results_headers, results_vals)
+                if line_id in details_index:
+                    details_file.seek(details_index[line_id])
+                    details_vals = self.parse(details_file.readline())
+                    details_dict = self.get_header_val_dict(details_headers, details_vals)
+                    # On a repeated entry, the Details value will overwrite Results value
+                    results_dict.update(details_dict)
+                line = [ results_dict.get(header, 'None') for header in headers ]
+                out_file.write(self.unparse(line))
+
+    def submit(self, in_path, analysis, poll_interval=2, timeout=None):
+        """
+        : Make a POST request to submit a job to production CRAVAT server, wait for it
+        : to finish and download its two report files into the working directory.
+        : Arguments:
+            : in_path: <str> path to the cravat file to upload
+            : analysis: <str> one of the values accepted by is_valid_analysis
+            : poll_interval: <number> seconds to sleep between status checks
+            : timeout: <number> seconds to wait for the job; None waits indefinitely
+        : Returns the tuple (results_path, details_path).
+        : Raises ValueError on an unknown analysis, CravatSubmissionException if the job
+        : fails or times out, and requests.HTTPError on an HTTP error response.
+        """
+        if not self.is_valid_analysis(analysis):
+            raise ValueError("Did not get valid analyses.")
+        analysis = self.normalize_analysis(analysis)
+        # Create post request to submit job to CRAVAT production server.
+        # The context manager closes the upload handle once the request is sent.
+        with open(in_path) as in_file:
+            submit = requests.post(SUBMIT_URL,
+                                    files={'inputfile' : in_file},
+                                    data={'email' : email,
+                                    'analyses' : analysis})
+        submit.raise_for_status()
+        jobid = json.loads(submit.text)['jobid']
+        # Check job run status in loop until status is 'Success'
+        started = time.time()
+        while True:
+            check = requests.get(STATUS_URL, params={'jobid': jobid})
+            check.raise_for_status()
+            status = json.loads(check.text)['status']
+            if status == 'Success':
+                break
+            # NOTE(review): assumes a failed job is reported as 'Error' - confirm against
+            # the CRAVAT web service documentation.
+            if status == 'Error':
+                raise CravatSubmissionException("CRAVAT job %s failed" % jobid)
+            if timeout is not None and time.time() - started > timeout:
+                raise CravatSubmissionException("Timed out waiting for CRAVAT job %s" % jobid)
+            time.sleep(poll_interval)
+        # Download completed job results to local files
+        timestamp = time.strftime("%Y-%m-%d_%H-%M-%S_")
+        results_path = 'Z_Variant_Result' + timestamp + '.tsv'
+        details_path = 'Z_Additional_Details' + timestamp + '.tsv'
+        urlretrieve(RESULTS_URL + jobid + "/" + "Variant.Result.tsv",
+            filename=results_path)
+        urlretrieve(RESULTS_URL + jobid + "/" + "Variant_Additional_Details.Result.tsv",
+            filename=details_path)
+        return results_path, details_path
+
+if __name__ == "__main__":
+    submission = CravatSubmission()
+    cmd_args = submission.get_cmd_args(sys.argv)
+    # Galaxy converts semi-colons to X's. Switch it back
+    analysis = cmd_args.analysis
+    if analysis == "VESTXCHASM":
+        analysis = "VEST;CHASM"
+    results_path, details_path = submission.submit(cmd_args.input, analysis)
+    submission.write_results(results_path, details_path, cmd_args.output)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat_submit/cravat_submit.xml	Thu Aug 16 15:10:07 2018 -0400
@@ -0,0 +1,34 @@
+<tool id="cravat_submit" name="CRAVAT Submit, Check, and Retrieve" version="0.1.0">
+    <description>Submits, checks for, and retrieves data for cancer annotation</description>
+    <!-- Arguments are quoted so that paths containing spaces or shell metacharacters reach the script intact -->
+    <command interpreter="python">cravat_submit.py --input '$input' --output '$output' --analysis '$dropdown'</command>
+
+    <inputs>
+        <param format="tabular" name="input" type="data" label="Source file"/>
+        <!-- "format" applies to data parameters only, so it is not set on the select -->
+        <param name="dropdown" type="select" label="Analysis Program">
+            <option value="None">None</option>
+            <option value="VEST">VEST</option>
+            <option value="CHASM">CHASM</option>
+            <option value="VEST;CHASM">VEST and CHASM</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="output" />
+    </outputs>
+
+    <tests>
+        <!-- TODO(review): the fixture file names look like placeholders unrelated to this tool and
+             no value is given for "dropdown"; replace them with a real CRAVAT input and expected output. -->
+        <test>
+            <param name="input" value="fa_gc_content_input.fa"/>
+            <output name="output" file="fa_gc_content_output.txt"/>
+        </test>
+    </tests>
+
+    <help>
+ This tool submits, checks for, and retrieves data for cancer annotation.
+    </help>
+
+</tool>