Next changeset 1:80eff5812b2a (2014-11-06) |
Commit message:
Uploaded |
added:
snpeff_to_peptides.py snpeff_to_peptides.xml test-data/all_pep.fa test-data/all_pep.tabular test-data/peptides_10_10.fa test-data/snpeff.vcf |
b |
diff -r 000000000000 -r fcb7188fa0d2 snpeff_to_peptides.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snpeff_to_peptides.py Fri Feb 07 15:05:20 2014 -0500 |
[ |
b'@@ -0,0 +1,239 @@\n+#!/usr/bin/env python\n+"""\n+#\n+#------------------------------------------------------------------------------\n+# University of Minnesota\n+# Copyright 2013, Regents of the University of Minnesota\n+#------------------------------------------------------------------------------\n+# Author:\n+#\n+# James E Johnson\n+#\n+#------------------------------------------------------------------------------\n+"""\n+\n+\n+"""\n+This tool takes a SnpEff VCF file and an Ensembl pep.all.fa file ( e.g. Homo_sapiens.GRCh37.73.pep.all.fa )\n+It outputs a peptide fasta file with the variant peptide sequence that result from NON_SYNONYMOUS_CODING effects \n+\n+"""\n+\n+import sys,re,os.path\n+import tempfile\n+import optparse\n+from optparse import OptionParser\n+import logging\n+\n+## dictionary for Amino Acid Abbreviations\n+aa_abbrev_dict = dict()\n+aa_abbrev_dict[\'Phe\'] = \'F\'\n+aa_abbrev_dict[\'Leu\'] = \'L\'\n+aa_abbrev_dict[\'Ser\'] = \'S\'\n+aa_abbrev_dict[\'Tyr\'] = \'Y\'\n+aa_abbrev_dict[\'Cys\'] = \'C\'\n+aa_abbrev_dict[\'Trp\'] = \'W\'\n+aa_abbrev_dict[\'Pro\'] = \'P\'\n+aa_abbrev_dict[\'His\'] = \'H\'\n+aa_abbrev_dict[\'Gln\'] = \'Q\'\n+aa_abbrev_dict[\'Arg\'] = \'R\'\n+aa_abbrev_dict[\'Ile\'] = \'I\'\n+aa_abbrev_dict[\'Met\'] = \'M\'\n+aa_abbrev_dict[\'Thr\'] = \'T\'\n+aa_abbrev_dict[\'Asn\'] = \'N\'\n+aa_abbrev_dict[\'Lys\'] = \'K\'\n+aa_abbrev_dict[\'Val\'] = \'V\'\n+aa_abbrev_dict[\'Ala\'] = \'A\'\n+aa_abbrev_dict[\'Asp\'] = \'D\'\n+aa_abbrev_dict[\'Glu\'] = \'E\'\n+aa_abbrev_dict[\'Gly\'] = \'G\'\n+\n+## Get the peptide ID and sequence a given ID \n+def get_sequence(id,seq_file):\n+ fh = open(seq_file, \'r\')\n+ try:\n+ for (ln,line) in enumerate(fh):\n+ if line.find(id) >= 0:\n+ fields = line.split(\'\\t\')\n+ return ( \' \'.join(fields[0:-1]),fields[-1].rstrip() if fields and len(fields) > 0 else None )\n+ except Exception, e:\n+ print >> sys.stderr, "failed: %s" % e\n+ finally:\n+ fh.close()\n+\n+def fasta_to_tabular(fasta_file,tabular_file):\n+ inFile = open(fasta_file,\'r\')\n+ outFile = open(tabular_file,\'w\') \n+ for i, line in enumerate( inFile ):\n+ line = line.rstrip( \'\\r\\n\' )\n+ if not line or line.startswith( \'#\' ):\n+ continue\n+ if line.startswith( \'>\' ):\n+ #Don\'t want any existing tabs to trigger extra columns:\n+ line = line.replace(\'\\t\', \' \')\n+ if i > 0:\n+ outFile.write(\'\\n\')\n+ outFile.write(line[1:])\n+ outFile.write(\'\\t\')\n+ else:\n+ outFile.write(line)\n+ if i > 0:\n+ outFile.write(\'\\n\')\n+ if inFile:\n+ inFile.close()\n+ if outFile:\n+ outFile.close()\n+\n+def __main__():\n+ #Parse Command Line\n+ parser = optparse.OptionParser()\n+ parser.add_option( \'-i\', \'--input\', dest=\'input\', help=\'The input snpeff vcf file with HGVS annotations (else read from stdin)\' )\n+ parser.add_option( \'-o\', \'--output\', dest=\'output\', help=\'The output fasta (else write to stdout)\' )\n+ parser.add_option( \'-p\', \'--protein_fasta\', dest=\'protein_fasta\', default=None, help=\'The Esembl protein fasta in tabular format\' )\n+ parser.add_option( \'-l\', \'--leading_aa_num\', dest=\'leading_aa_num\', type=\'int\', default=None, help=\'leading number of AAs to output\' )\n+ parser.add_option( \'-t\', \'--trailing_aa_num\', dest=\'trailing_aa_num\', type=\'int\', default=None, help=\'trailing number of AAs to output\' )\n+ parser.add_option( \'-d\', \'--debug\', dest=\'debug\', action=\'store_true\', default=False, help=\'Turn on wrapper debugging to stdout\' )\n+ (options, args) = parser.parse_args()\n+\n+ # need protein_fasta file\n+ fastaFile = options.protein_fasta\n+ if options.protein_fasta == None:\n+ print >> sys.stderr, "Ensembl protein_fasta tabular file required"\n+ exit(4)\n+ else:\n+ # determine if fasta is already in tabular format\n+ is_tabular = False\n+ standard_aa = \'^[AC-IK-WY]+$\'\n+ standard_na = \'^[ACGTN]+$\'\n+ inFile = open(fastaFile,\'r\')\n+ try:\n+ nseq = 0\n+ for i, line in enumerate( inFile ):\n+ line = line.rstrip( \'\\r\\n\' )\n+ if not line or line.startswith( \'#\' ):\n+ continue\n+ fields = line.split(\'\\t\''..b'tswith(\'##\'):\n+ vcf_header.append(line)\n+ # May need to check SnpEff version in the header, the EFF info changed between versions 2 and 3\n+ ##SnpEffVersion\n+ elif line.startswith(\'#CHROM\'):\n+ reading_entries = True\n+ else:\n+ fields = line.split(\'\\t\')\n+ # This is the current format of the EFF entry:\n+ # EFF=missense(MODERATE|MISSENSE|Ggg/Cgg|G528R|802|SCNN1D|protein_coding|CODING|ENST00000379116|12|1);OICR=(ENST00000379116|1808) \n+ # If this becomes variable, will need to dynamically pattern this on the defintion in the vcf header:\n+ ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: \'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )\' ">\n+ (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]\n+ for info_item in info.split(\';\'):\n+ try:\n+ if info_item.find(\'=\') < 0:\n+ continue\n+ (key,val) = info_item.split(\'=\',1)\n+ if key == \'EFF\':\n+ effects = val.split(\',\')\n+ for effect in effects:\n+ (eff,effs) = effect.rstrip(\')\').split(\'(\')\n+ if not eff == \'NON_SYNONYMOUS_CODING\':\n+ continue\n+ eff_fields = effs.split(\'|\')\n+ (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]\n+ if transcript:\n+ aa_pos = None # 1-based position\n+ alt_aa = \'_\' \n+ # parse aa_change\n+ # get AA change position and alternate Animo Acid\n+ sap = aa_change\n+ m = re.match(aa_change_regex,aa_change)\n+ if m:\n+ aa_pos = int(m.groups()[1])\n+ alt_aa = m.groups()[2]\n+ else:\n+ m = re.match(aa_hgvs_regex,aa_change)\n+ if m:\n+ aa_pos = int(m.groups()[1])\n+ ref_aa = aa_abbrev_dict[m.groups()[0]]\n+ alt_aa = aa_abbrev_dict[m.groups()[2]]\n+ sap = "%s%d%s" % (ref_aa,aa_pos,alt_aa)\n+ if not aa_pos:\n+ continue\n+ # get AA sequence\n+ aa_offset = aa_pos - 1\n+ (pep_id,pep_seq) = get_sequence(transcript,fastaFile)\n+ if not pep_seq:\n+ continue\n+ start_pos = max(aa_offset - options.leading_aa_num, 0) if options.leading_aa_num else 0\n+ end_pos = min(aa_offset + options.trailing_aa_num + 1, len(pep_seq)) if options.trailing_aa_num else len(pep_seq)\n+ # transform sequence\n+ alt_seq = pep_seq[start_pos:aa_offset] + alt_aa + pep_seq[aa_offset+1:end_pos]\n+ # >ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:1:22778472 codon_change:Gtg/Atg sap:V885M\n+ pep_id = re.sub(\'pep:[a-z]*\',\'pep:sap\',pep_id)\n+ hdr = ">%s snp_location:%s:%s codon_change:%s sap:%s\\n" % (pep_id, chrom, pos, codon_change, sap)\n+ outputFile.write(hdr)\n+ if options.debug:\n+ trimmed_seq = pep_seq[start_pos:end_pos]\n+ outputFile.write(trimmed_seq)\n+ outputFile.write(\'\\n\')\n+ outputFile.write(alt_seq)\n+ outputFile.write(\'\\n\')\n+ except Exception, e:\n+ print >> sys.stderr, "failed: %s" % e\n+ except Exception, e:\n+ print >> sys.stderr, "failed: %s" % e\n+ exit(1)\n+\n+if __name__ == "__main__" : __main__()\n+\n' |
b |
diff -r 000000000000 -r fcb7188fa0d2 snpeff_to_peptides.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snpeff_to_peptides.xml Fri Feb 07 15:05:20 2014 -0500 |
[ |
@@ -0,0 +1,76 @@ +<?xml version="1.0"?> +<tool id="snpeff_to_peptides" name="SnpEff to Peptide fasta" version="0.0.1"> + <description> to create a Search DB fasta for variant SAP peptides</description> + <command interpreter="python">snpeff_to_peptides.py --input "$snpeff_vcf" --protein_fasta "$all_pep_fasta" --output "$peptide_variant_fasta" + #if $leading_aa_num: + --leading_aa_num $leading_aa_num + #end if + #if $trailing_aa_num: + --trailing_aa_num $trailing_aa_num + #end if + </command> + <inputs> + <param name="snpeff_vcf" type="data" format="vcf" label="SnpEff generated VCF file with NON_SYNONYMOUS_CODING annotations"/> + <param name="all_pep_fasta" type="data" format="fasta,tabular" label="Ensembl all_pep.fa" + help="An Ensembl all_pep.fa file corresponding to the genome build used for SnpEff (May be converted to tabular fasta format)"/> + <param name="leading_aa_num" type="integer" value="30" min="0" optional="true" label="Preceeding AAs" + help="The number of Amino Acids to include before the variant position (leave blank to include all)"/> + <param name="trailing_aa_num" type="integer" value="30" min="0" optional="true" label="Following AAs" + help="The number of Amino Acids to include after the variant position (leave blank to include all)"/> + </inputs> + <stdio> + <exit_code range="1:" level="fatal" description="Error" /> + </stdio> + <outputs> + <data name="peptide_variant_fasta" metadata_source="all_pep_fasta" format="fasta"/> + </outputs> + <tests> + <test> + <param name="snpeff_vcf" value="snpeff.vcf" ftype="vcf" dbkey="hg19"/> + <param name="all_pep_fasta" value="all_pep.fa" ftype="fasta" dbkey="hg19"/> + <param name="leading_aa_num" value="10"/> + <param name="trailing_aa_num" value="10"/> + <output name="peptide_variant_fasta" file="peptides_10_10.fa"/> + </test> + <test> + <param name="snpeff_vcf" value="snpeff.vcf" ftype="vcf" dbkey="hg19"/> + <param name="all_pep_fasta" value="all_pep.tabular" ftype="tabular" dbkey="hg19"/> + <param name="leading_aa_num" value="10"/> + <param name="trailing_aa_num" value="10"/> + <output name="peptide_variant_fasta" file="peptides_10_10.fa"/> + </test> + </tests> + <help> +**SnpEff to Peptide Fasta** + +This generates a fasta file of peptide sequences with SAPs ( Single Amino acid Polymorphisms ) +from the NON_SYNONYMOUS_CODING EFF annnotations from the SnpEff_ application. +The SnpEff VCF may be filtered or annotated using SnpSift. + +The following is appended to the fasta ID line: snp_location:chr:position codon_change:nnn/nnn sap:AposA + +For VCF entry:: + + chr1 22846709 . G A 9.31 . DP=2;VDB=0.0174;AF1=1;AC1=2;DP4=0,0,1,1;MQ=20;FQ=-33;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Gtg/Atg|V885M|1127|ZBTB40|protein_coding|CODING|ENST00000374651|12|1) PL 40,6,0 + +The peptide fasta entry that matches transcript ID: ENST00000374651 would be:: + + >ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding + +The ID of the output peptide fasta ID would be:: + + >ENSP00000363782 pep:sap chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:22846709 codon_change:Gtg/Atg sap:V885M + + +.. _SnpEff: http://snpeff.sourceforge.net/index.html + +**Citation** + +SnpEff citation: +"A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.", Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. Fly (Austin). 2012 Apr-Jun;6(2):80-92. PMID: 22728672 [PubMed - in process] + +SnpSift citation: +"Using Drosophila melanogaster as a model for genotoxic chemical mutational studies with a new program, SnpSift", Cingolani, P., et. al., Frontiers in Genetics, 3, 2012. + + </help> +</tool> |
b |
diff -r 000000000000 -r fcb7188fa0d2 test-data/all_pep.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_pep.fa Fri Feb 07 15:05:20 2014 -0500 |
b |
@@ -0,0 +1,47 @@ +>ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding +MELPNYSRQLLQQLYTLCKEQQFCDCTISIGTIYFRAHKLVLAAASLLFKTLLDNTDTISIDASVVSPEE +FALLLEMMYTGKLPVGKHNFSKIISLADSLQMFDVAVSCKNLLTSLVNCSVQGQVVRDVSAPSSETFRKE +PEKPQVEILSSEGAGEPHSSPELAATPGGPVKAETEEAAHSVSQEMSVNSPTAQESQRNAETPAETPTTA +EACSPSPAVQTFSEAKKTSTEPGCERKHYQLNFLLENEGVFSDALMVTQDVLKKLEMCSEIKGPQKEVIL +NCCEGRTPKETIENLLHRMTEEKTLTAEGLVKLLQAVKTTFPNLGLLLEKLQKSATLPSTTVQPSPDDYG +TELLRRYHENLSEIFTDNQILLKMISHMTSLAPGEREVMEKLVKRDSGSGGFNSLISAVLEKQTLSATAI +WQLLLVVQETKTCPLDLLMEEIRREPGADAFFRAVTTPEHATLETILRHNQLILEAIQQKIEYKLFTSEE +EHLAETVKEILSIPSETASPEASLRAVLSRAMEKSVPAIEICHLLCSVHKSFPGLQPVMQELAYIGVLTK +EDGEKETWKVSNKFHLEANNKEDEKAAKEDSQPGEQNDQGETGSLPGQQEKEASASPDPAKKSFICKACD +KSFHFYCRLKVHMKRCRVAKSKQVQCKECSETKDSKKELDKHQLEAHGAGGEPDAPKKKKKRLPVTCDLC +GREFAHASGMQYHKLTEHFDEKPFSCEECGAKFAANSTLKNHLRLHTGDRPFMCKHCLMTFTQASALAYH +TKKKHSEGKMYACQYCDAVFAQSIELSRHVRTHTGDKPYVCRDCGKGFRQANGLSIHLHTFHNIEDPYDC +KKCRMSFPTLQDHRKHIHEVHSKEYHPCPTCGKIFSAPSMLERHVVTHVGGKPFSCGICNKAYQQLSGLW +YHNRTHHPDVFAAQNHRSSKFSSLQCSSCDKTFPNTIEHKKHIKAEHADMKFHECDQCKELFPTPALLQV +HVKCQHSGSQPFRCLYCAATFRFPGALQHHVTTEHFKQSETTFPCELCGELFTSQAQLDSHLESEHPKVM +STETQAAASQMAQVIQTPEPVAPTEQVITLEETQLAGSQVFVTLPDSQASQASSELVAVTVEDLLDGTVT +LICGEAK +>ENSP00000346634 pep:known chromosome:GRCh37:1:36690017:36770958:1 gene:ENSG00000054118 transcript:ENST00000354618 gene_biotype:protein_coding transcript_biotype:protein_coding +MSKTNKSKSGSRSSRSRSASRSRSRSFSKSRSRSRSLSRSRKRRLSSRSRSRSYSPAHNRERNHPRVYQN +RDFRGHNRGYRRPYYFRGRNRGFYPWGQYNRGGYGNYRSNWQNYRQAYSPRRGRSRSRSPKRRSPSPRSR +SHSRNSDKSSSDRSRRSSSSRSSSNHSRVESSKRKSAKEKKSSSKDSRPSQAAGDNQGDEAKEQTFSGGT +SQDTKASESSKPWPDATYGTGSASRASAVSELSPRERSPALKSPLQSVVVRRRSPRPSPVPKPSPPLSST +SQMGSTLPSGAGYQSGTHQGQFDHGSGSLSPSKKSPVGKSPPSTGSTYGSSQKEESAASGGAAYTKRYLE +EQKTENGKDKEQKQTNTDKEKIKEKGSFSDTGLGDGKMKSDSFAPKTDSEKPFRGSQSPKRYKLRDDFEK +KMADFHKEEMDDQDKDKAKGRKESEFDDEPKFMSKVIGANKNQEEEKSGKWEGLVYAPPGKEKQRKTEEL +EEESFPERSKKEDRGKRSEGGHRGFVPEKNFRVTAYKAVQEKSSSPPPRKTSESRDKLGAKGDFPTGKSS +FSITREAQVNVRMDSFDEDLARPSGLLAQERKLCRDLVHSNKKEQEFRSIFQHIQSAQSQRSPSELFAQH +IVTIVHHVKEHHFGSSGMTLHERFTKYLKRGTEQEAAKNKKSPEIHRRIDISPSTFRKHGLAHDEMKSPR +EPGYKAEGKYKDDPVDLRLDIERRKKHKERDLKRGKSRESVDSRDSSHSRERSAEKTEKTHKGSKKQKKH +RRARDRSRSSSSSSQSSHSYKAEEYTEETEEREESTTGFDKSRLGTKDFVGPSERGGGRARGTFQFRARG +RGWGRGNYSGNNNNNSNNDFQKRNREEEWDPEYTPKSKKYYLHDDREGEGSDKWVSRGRGRGAFPRGRGR +FMFRKSSTSPKWAHDKFSGEEGEIEDDESGTENREEKDNIQPTTE +>ENSP00000318415 pep:known chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding +MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMN +GKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNM +SSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPR +DDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGS +YRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPP +SVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY +>ENSP00000446099 pep:known chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding +MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMN +GKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNM +SSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPR +DDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGS +YRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPP +SVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY |
b |
diff -r 000000000000 -r fcb7188fa0d2 test-data/all_pep.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_pep.tabular Fri Feb 07 15:05:20 2014 -0500 |
b |
@@ -0,0 +1,4 @@ +ENSP00000363782 pep:known chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding MELPNYSRQLLQQLYTLCKEQQFCDCTISIGTIYFRAHKLVLAAASLLFKTLLDNTDTISIDASVVSPEEFALLLEMMYTGKLPVGKHNFSKIISLADSLQMFDVAVSCKNLLTSLVNCSVQGQVVRDVSAPSSETFRKEPEKPQVEILSSEGAGEPHSSPELAATPGGPVKAETEEAAHSVSQEMSVNSPTAQESQRNAETPAETPTTAEACSPSPAVQTFSEAKKTSTEPGCERKHYQLNFLLENEGVFSDALMVTQDVLKKLEMCSEIKGPQKEVILNCCEGRTPKETIENLLHRMTEEKTLTAEGLVKLLQAVKTTFPNLGLLLEKLQKSATLPSTTVQPSPDDYGTELLRRYHENLSEIFTDNQILLKMISHMTSLAPGEREVMEKLVKRDSGSGGFNSLISAVLEKQTLSATAIWQLLLVVQETKTCPLDLLMEEIRREPGADAFFRAVTTPEHATLETILRHNQLILEAIQQKIEYKLFTSEEEHLAETVKEILSIPSETASPEASLRAVLSRAMEKSVPAIEICHLLCSVHKSFPGLQPVMQELAYIGVLTKEDGEKETWKVSNKFHLEANNKEDEKAAKEDSQPGEQNDQGETGSLPGQQEKEASASPDPAKKSFICKACDKSFHFYCRLKVHMKRCRVAKSKQVQCKECSETKDSKKELDKHQLEAHGAGGEPDAPKKKKKRLPVTCDLCGREFAHASGMQYHKLTEHFDEKPFSCEECGAKFAANSTLKNHLRLHTGDRPFMCKHCLMTFTQASALAYHTKKKHSEGKMYACQYCDAVFAQSIELSRHVRTHTGDKPYVCRDCGKGFRQANGLSIHLHTFHNIEDPYDCKKCRMSFPTLQDHRKHIHEVHSKEYHPCPTCGKIFSAPSMLERHVVTHVGGKPFSCGICNKAYQQLSGLWYHNRTHHPDVFAAQNHRSSKFSSLQCSSCDKTFPNTIEHKKHIKAEHADMKFHECDQCKELFPTPALLQVHVKCQHSGSQPFRCLYCAATFRFPGALQHHVTTEHFKQSETTFPCELCGELFTSQAQLDSHLESEHPKVMSTETQAAASQMAQVIQTPEPVAPTEQVITLEETQLAGSQVFVTLPDSQASQASSELVAVTVEDLLDGTVTLICGEAK +ENSP00000346634 pep:known chromosome:GRCh37:1:36690017:36770958:1 gene:ENSG00000054118 transcript:ENST00000354618 gene_biotype:protein_coding transcript_biotype:protein_coding MSKTNKSKSGSRSSRSRSASRSRSRSFSKSRSRSRSLSRSRKRRLSSRSRSRSYSPAHNRERNHPRVYQNRDFRGHNRGYRRPYYFRGRNRGFYPWGQYNRGGYGNYRSNWQNYRQAYSPRRGRSRSRSPKRRSPSPRSRSHSRNSDKSSSDRSRRSSSSRSSSNHSRVESSKRKSAKEKKSSSKDSRPSQAAGDNQGDEAKEQTFSGGTSQDTKASESSKPWPDATYGTGSASRASAVSELSPRERSPALKSPLQSVVVRRRSPRPSPVPKPSPPLSSTSQMGSTLPSGAGYQSGTHQGQFDHGSGSLSPSKKSPVGKSPPSTGSTYGSSQKEESAASGGAAYTKRYLEEQKTENGKDKEQKQTNTDKEKIKEKGSFSDTGLGDGKMKSDSFAPKTDSEKPFRGSQSPKRYKLRDDFEKKMADFHKEEMDDQDKDKAKGRKESEFDDEPKFMSKVIGANKNQEEEKSGKWEGLVYAPPGKEKQRKTEELEEESFPERSKKEDRGKRSEGGHRGFVPEKNFRVTAYKAVQEKSSSPPPRKTSESRDKLGAKGDFPTGKSSFSITREAQVNVRMDSFDEDLARPSGLLAQERKLCRDLVHSNKKEQEFRSIFQHIQSAQSQRSPSELFAQHIVTIVHHVKEHHFGSSGMTLHERFTKYLKRGTEQEAAKNKKSPEIHRRIDISPSTFRKHGLAHDEMKSPREPGYKAEGKYKDDPVDLRLDIERRKKHKERDLKRGKSRESVDSRDSSHSRERSAEKTEKTHKGSKKQKKHRRARDRSRSSSSSSQSSHSYKAEEYTEETEEREESTTGFDKSRLGTKDFVGPSERGGGRARGTFQFRARGRGWGRGNYSGNNNNNSNNDFQKRNREEEWDPEYTPKSKKYYLHDDREGEGSDKWVSRGRGRGAFPRGRGRFMFRKSSTSPKWAHDKFSGEEGEIEDDESGTENREEKDNIQPTTE +ENSP00000318415 pep:known chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMNGKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNMSSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPRDDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGSYRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPPSVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY +ENSP00000446099 pep:known chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding MVEADRPGKLFIGGLNTETNEKALETVFGKYGRIVEVLLIKDRETNKSRGFAFVTFESPADAKDAARDMNGKSLDGKAIKVEQATKPSFERGRHGPPPPPRSRGPPRGFGAGRGGSGGTRGPPSRGGHMDDGGYSMNFNMSSSRGPLPVKRGPPPRSGGPSPKRSAPSGLVRSSSGMGGRAPLSRGRDSYGGPPRREPLPSRRDVYLSPRDDGYSTKDSYSSRDYPSSRDTRDYAPPPRDYTYRDYGHSSSRDDYPSRGYGDRDGYGRDRDYSDHPSGGSYRDSYESYGNSRSAPLTRGPPPSYGGSSRYDDYSSSRDGYGGSRDSYSSSRSDLYSSCDRVGRQERGLPPSVERGYPSSRDSYSSSSRGAPRGAGPGGSRSDRGGGRSRY |
b |
diff -r 000000000000 -r fcb7188fa0d2 test-data/peptides_10_10.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptides_10_10.fa Fri Feb 07 15:05:20 2014 -0500 |
b |
@@ -0,0 +1,12 @@ +>ENSP00000363782 pep:sap chromosome:GRCh37:1:22778472:22853855:1 gene:ENSG00000184677 transcript:ENST00000374651 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:22846709 codon_change:Gtg/Atg sap:V885M +FSAPSMLERHMVTHVGGKPFS +>ENSP00000346634 pep:sap chromosome:GRCh37:1:36690017:36770958:1 gene:ENSG00000054118 transcript:ENST00000354618 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:36752433 codon_change:gCc/gTc sap:A201V +QAAGDNQGDEVKEQTFSGGTS +>ENSP00000318415 pep:sap chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449390 codon_change:atA/atG sap:I40M +KYGRIVEVLLMKDRETNKSRG +>ENSP00000446099 pep:sap chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449390 codon_change:atA/atG sap:I40M +KYGRIVEVLLMKDRETNKSRG +>ENSP00000318415 pep:sap chromosome:GRCh37:1:89445139:89458455:-1 gene:ENSG00000213516 transcript:ENST00000321792 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449434 codon_change:Aca/Gca sap:T26A +NTETNEKALEAVFGKYGRIVE +>ENSP00000446099 pep:sap chromosome:GRCh37:1:89445142:89458643:-1 gene:ENSG00000213516 transcript:ENST00000399794 gene_biotype:protein_coding transcript_biotype:protein_coding snp_location:chr1:89449434 codon_change:Aca/Gca sap:T26A +NTETNEKALEAVFGKYGRIVE |
b |
diff -r 000000000000 -r fcb7188fa0d2 test-data/snpeff.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/snpeff.vcf Fri Feb 07 15:05:20 2014 -0500 |
[ |
@@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##samtoolsVersion=0.1.18 (r982:295) +##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth"> +##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"> +##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads"> +##INFO=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same"> +##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele frequency (assuming HWE)"> +##INFO=<ID=AC1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele count (no HWE assumption)"> +##INFO=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies"> +##INFO=<ID=HWE,Number=1,Type=Float,Description="Chi^2 based HWE test P-value based on G3"> +##INFO=<ID=CLR,Number=1,Type=Integer,Description="Log ratio of genotype likelihoods with and without the constraint"> +##INFO=<ID=UGT,Number=1,Type=String,Description="The most probable unconstrained genotype configuration in the trio"> +##INFO=<ID=CGT,Number=1,Type=String,Description="The most probable constrained genotype configuration in the trio"> +##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias"> +##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL."> +##INFO=<ID=PC2,Number=2,Type=Integer,Description="Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2."> +##INFO=<ID=PCHI2,Number=1,Type=Float,Description="Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples."> +##INFO=<ID=QCHI2,Number=1,Type=Integer,Description="Phred scaled PCHI2."> +##INFO=<ID=PR,Number=1,Type=Integer,Description="# permutations yielding a smaller PCHI2."> +##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> +##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases"> +##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value"> +##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods"> +##SnpEffVersion="3.4 (build 2013-11-27), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -snp -no-downstream -no-intergenic -no-intron -no-upstream -no-utr -stats /galaxy/PRODUCTION/database/files/000/376/dataset_376197.dat GRCh37.73 /galaxy/PRODUCTION/database/files/000/376/dataset_376194.dat " +##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )' "> +##SnpEffCmd="SnpEff -i vcf -o vcf -upDownStreamLen 5000 -spliceSiteSize 1 -geneId -no-downstream -no-intergenic -no-intron -no-upstream -no-utr -stats /galaxy/PRODUCTION/database/files/000/376/dataset_376199.dat GRCh37.73 /galaxy/PRODUCTION/database/files/000/376/dataset_376196.dat " +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT /galaxy/PRODUCTION/database/tmp/tmp-SAMTOOLS-BmXdVn/bam_input_0.bam +chr1 22846709 . G A 9.31 . DP=2;VDB=0.0174;AF1=1;AC1=2;DP4=0,0,1,1;MQ=20;FQ=-33;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Gtg/Atg|V885M|1127|ZBTB40|protein_coding|CODING|ENST00000374651|12|1) PL 40,6,0 +chr1 36752433 . C T 6.02 . DP=2;VDB=0.0099;AF1=1;AC1=2;DP4=0,0,2,0;MQ=20;FQ=-33;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|gCc/gTc|A201V|955|THRAP3|protein_coding|CODING|ENST00000354618|4|1) PL 36,6,0 +chr1 89449390 . T C 15.1 . DP=3;VDB=0.0214;AF1=1;AC1=2;DP4=0,0,1,2;MQ=20;FQ=-36;EFF=EXON(MODIFIER|||||RBMXL1|processed_transcript|CODING|ENST00000413769|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|atA/atG|I40M|390|RBMXL1|protein_coding|CODING|ENST00000321792|2|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|atA/atG|I40M|390|RBMXL1|protein_coding|CODING|ENST00000399794|3|1);EFF=EXON(MODIFIER|||||ENSG00000213516|processed_transcript|CODING|ENST00000413769|3|1) PL 47,9,0 +chr1 89449434 . T C 9.31 . DP=2;VDB=0.0120;AF1=1;AC1=2;DP4=0,0,1,1;MQ=20;FQ=-33;EFF=EXON(MODIFIER|||||RBMXL1|processed_transcript|CODING|ENST00000413769|3|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Aca/Gca|T26A|390|RBMXL1|protein_coding|CODING|ENST00000321792|2|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Aca/Gca|T26A|390|RBMXL1|protein_coding|CODING|ENST00000399794|3|1);EFF=EXON(MODIFIER|||||ENSG00000213516|processed_transcript|CODING|ENST00000413769|3|1) PL 40,6,0 |