Previous changeset 0:6430407e5869 (2015-04-03) Next changeset 2:503ab8a39006 (2016-12-07) |
Commit message:
Uploaded |
modified:
unipept.py unipept.xml |
added:
test-data/peptide.fa test-data/tryptic.fa test-data/tryptic.tsv |
removed:
.shed.yml |
b |
diff -r 6430407e5869 -r 0c1ee95282fa .shed.yml --- a/.shed.yml Fri Apr 03 14:55:49 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,18 +0,0 @@ -categories: - - Proteomics - - Metaproteomics -description: Unipept retrieves metaproteomics information -homepage_url: https://github.com/galaxyproteomics/tools-galaxyp -long_description: 'Unipept retrieves taxonomy for tryptic peptides using the unipept API. - http://unipept.ugent.be - http://unipept.ugent.be/apidocs - - The Unipept metaproteomics analysis pipeline - Bart Mesuere1,*, Griet Debyser2, Maarten Aerts3, Bart Devreese2, Peter Vandamme3 andPeter Dawyndt1 - Article first published online: 11 FEB 2015 - DOI: 10.1002/pmic.201400361 - http://onlinelibrary.wiley.com/doi/10.1002/pmic.201400361/abstract;jsessionid=BFF1994E4C14DA73D7C907EB208AD710.f04t04 - ' -name: unipept -owner: galaxyp -remote_repository_url: http://unipept.ugent.be/apidocs |
b |
diff -r 6430407e5869 -r 0c1ee95282fa test-data/peptide.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptide.fa Tue Apr 14 16:44:22 2015 -0400 |
b |
@@ -0,0 +1,9 @@ +>tr|G3RWV1|G3RWV1_GORGO +VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKG +SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV +>sp|Q9BYE9|CDHR2_HUMAN +VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG +SNGTFLLSLGGPDAEAFSVSPERAVGSASVQVLVRVSALVDYERQTAMAV +>tr|H2QS28|H2QS28_PANTR +VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPRIPIDDLTMVVYDPDKG +SNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSGLVDYERQTAMAV |
b |
diff -r 6430407e5869 -r 0c1ee95282fa test-data/tryptic.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tryptic.fa Tue Apr 14 16:44:22 2015 -0400 |
b |
@@ -0,0 +1,19 @@ +>trypticQTAMAV +QTAMAV +>trypticAAGSASVQVLVR +AAGSASVQVLVR +>trypticAVGSASVQVLVR +AVGSASVQVLVR +>trypticIPIDDLTMVVYDPDK +IPIDDLTMVVYDPDK +>trypticVSGLVDYER +VSGLVDYER +>trypticVSALVDYER +VSALVDYER +>trypticGSNGTFLLSLGGPDAEAFSVSPER +GSNGTFLLSLGGPDAEAFSVSPER +>trypticVMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR +VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR + + + |
b |
diff -r 6430407e5869 -r 0c1ee95282fa test-data/tryptic.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tryptic.tsv Tue Apr 14 16:44:22 2015 -0400 |
b |
@@ -0,0 +1,8 @@ +1 QTAMAV QTAMAV +2 AAGSASVQVLVR AAGSASJQVLVR +3 AVGSASVQVLVR AVGSASVQVLVR +4 IPIDDLTMVVYDPDK IPIDDLTMVVYDPDK +5 GSNGTFLLSLGGPDAEAFSVSPER GSNGTFLLSLGGPDAEAFSVSPE +6 VSGLVDYER VSGLVDYER +7 VSALVDYER VSALVDYER +8 VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPR |
b |
diff -r 6430407e5869 -r 0c1ee95282fa unipept.py --- a/unipept.py Fri Apr 03 14:55:49 2015 -0400 +++ b/unipept.py Tue Apr 14 16:44:22 2015 -0400 |
[ |
b'@@ -31,39 +31,86 @@\n if exit_code:\n sys.exit(exit_code)\n \n-def read_fasta(fp):\n+pept2lca_column_order = [\'peptide\',\'taxon_rank\',\'taxon_id\',\'taxon_name\']\n+pept2lca_extra_column_order = [\'peptide\',\'superkingdom\',\'kingdom\',\'subkingdom\',\'superphylum\',\'phylum\',\'subphylum\',\'superclass\',\'class\',\'subclass\',\'infraclass\',\'superorder\',\'order\',\'suborder\',\'infraorder\',\'parvorder\',\'superfamily\',\'family\',\'subfamily\',\'tribe\',\'subtribe\',\'genus\',\'subgenus\',\'species_group\',\'species_subgroup\',\'species\',\'subspecies\',\'varietas\',\'forma\' ]\n+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]\n+pept2prot_column_order = [\'peptide\',\'uniprot_id\',\'taxon_id\']\n+pept2prot_extra_column_order = pept2prot_column_order + [\'taxon_name\',\'ec_references\',\'go_references\',\'refseq_ids\',\'refseq_protein_ids\',\'insdc_ids\',\'insdc_protein_ids\']\n+\n+def __main__():\n+ version = \'1.1\'\n+ pep_pat = \'^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$\'\n+\n+ def read_tabular(filepath,col):\n+ peptides = []\n+ with open(filepath) as fp:\n+ for i,line in enumerate(fp):\n+ if line.strip() == \'\' or line.startswith(\'#\'):\n+ continue\n+ fields = line.rstrip(\'\\n\').split(\'\\t\')\n+ peptide = fields[col]\n+ if not re.match(pep_pat,peptide):\n+ warn_err(\'"%s" is not a peptide (line %d column %d of tabular file: %s)\\n\' % (peptide,i,col,filepath),exit_code=invalid_ec)\n+ peptides.append(peptide)\n+ return peptides\n+\n+ def get_fasta_entries(fp):\n name, seq = None, []\n for line in fp:\n- line = line.rstrip()\n- if line.startswith(">"):\n- if name: yield (name, \'\'.join(seq))\n- name, seq = line, []\n- else:\n- seq.append(line)\n+ line = line.rstrip()\n+ if line.startswith(">"):\n+ if name: yield (name, \'\'.join(seq))\n+ name, seq = line, []\n+ else:\n+ seq.append(line)\n if name: yield (name, \'\'.join(seq))\n \n-def read_mzid(fp):\n- peptides = []\n- for event, elem in ET.iterparse(fp):\n- if event == \'end\':\n- if re.search(\'PeptideSequence\',elem.tag):\n- peptides.append(elem.text)\n- return peptides\n+ def read_fasta(filepath):\n+ peptides = []\n+ with open(filepath) as fp:\n+ for id, peptide in get_fasta_entries(fp):\n+ if not re.match(pep_pat,peptide):\n+ warn_err(\'"%s" is not a peptide (id %s of fasta file: %s)\\n\' % (peptide,id,filepath),exit_code=invalid_ec)\n+ peptides.append(peptide)\n+ return peptides\n+\n+ def read_mzid(fp):\n+ peptides = []\n+ for event, elem in ET.iterparse(fp):\n+ if event == \'end\':\n+ if re.search(\'PeptideSequence\',elem.tag):\n+ peptides.append(elem.text)\n+ return peptides\n \n-def read_pepxml(fp):\n- peptides = []\n- for event, elem in ET.iterparse(fp):\n- if event == \'end\':\n- if re.search(\'search_hit\',elem.tag):\n- peptides.append(elem.get(\'peptide\'))\n- return peptides\n+ def read_pepxml(fp):\n+ peptides = []\n+ for event, elem in ET.iterparse(fp):\n+ if event == \'end\':\n+ if re.search(\'search_hit\',elem.tag):\n+ peptides.append(elem.get(\'peptide\'))\n+ return peptides\n \n-def __main__():\n+ def best_match(peptide,matches):\n+ if not matches:\n+ return None\n+ elif len(matches) == 1:\n+ return matches[0].copy()\n+ else:\n+ # find the most specific match (peptide is always the first column order field)\n+ for col in reversed(pept2lca_extra_column_order[1:]):\n+ col_id = col+"_id" if options.extra else col\n+ for match in matches:\n+ if \'taxon_rank\' in match and match[\'taxon_rank\'] == col:\n+ return match.copy()\n+ if col_id in match and match[col_id]:\n+ return match.copy()\n+ return None\n+\n #Parse Command Line\n parser = optparse.OptionParser()\n- # unipept API\n- parser.add_option( \'-A\', \'--api\', dest=\'unipept\', default=\'pept2lca\', choices=[\'pept2lca\',\'pept2taxa\',\'pept2prot\'], help=\'The unipept application: pept2lca, pept2taxa, or pep'..b'rint >> sys.stdout,"unipept response: %s\\n" % str(unipept_resp)\n+ if options.unipept == \'pept2prot\' or options.unipept == \'pept2taxa\':\n+ dupkey = \'uniprot_id\' if options.unipept == \'pept2prot\' else \'taxon_id\' ## should only keep one of these per input peptide\n+ ## multiple entries per trypticPeptide for pep2prot or pep2taxa\n+ mapping = {}\n+ for match in unipept_resp:\n+ mapping.setdefault(match[\'peptide\'],[]).append(match)\n+ for peptide in peptides:\n+ # Get the intersection of matches to the tryptic parts\n+ keyToMatch = None\n+ for part in pepToParts[peptide]:\n+ if part in mapping:\n+ temp = {match[dupkey] : match for match in mapping[part]}\n+ if keyToMatch:\n+ dkeys = set(keyToMatch.keys()) - set(temp.keys())\n+ for k in dkeys:\n+ del keyToMatch[k]\n+ else:\n+ keyToMatch = temp\n+ ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp\n+ if not keyToMatch:\n+ unmatched_peptides.append(peptide)\n+ else:\n+ for key,match in keyToMatch.iteritems():\n+ match[\'tryptic_peptide\'] = match[\'peptide\']\n+ match[\'peptide\'] = peptide\n+ peptideMatches.append(match)\n+ else:\n+ ## should be one response per trypticPeptide for pep2lca\n+ respMap = {v[\'peptide\']:v for v in unipept_resp}\n+ ## map resp back to peptides\n+ for peptide in peptides:\n+ matches = list()\n+ for part in pepToParts[peptide]:\n+ if part in respMap:\n+ matches.append(respMap[part])\n+ match = best_match(peptide,matches)\n+ if not match:\n+ unmatched_peptides.append(peptide)\n+ longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]\n+ match = {\'peptide\' : longest_tryptic_peptide}\n+ match[\'tryptic_peptide\'] = match[\'peptide\']\n+ match[\'peptide\'] = peptide\n+ peptideMatches.append(match)\n+ resp = peptideMatches\n+ if options.debug: print >> sys.stdout,"\\nmapped response: %s\\n" % str(resp)\n ## output results\n- if not (options.mismatch or options.json or options.tsv or options.csv):\n+ if not (options.unmatched or options.json or options.tsv or options.csv):\n print >> sys.stdout, str(resp)\n- if options.mismatch:\n- peptides_matched = []\n- for i,pdict in enumerate(resp):\n- peptides_matched.append(pdict[\'peptide\'])\n- with open(options.mismatch,\'w\') as outputFile:\n+ if options.unmatched:\n+ with open(options.unmatched,\'w\') as outputFile:\n for peptide in peptides:\n- if not peptide in peptides_matched:\n+ if peptide in unmatched_peptides:\n outputFile.write("%s\\n" % peptide)\n if options.json:\n with open(options.json,\'w\') as outputFile:\n outputFile.write(str(resp)) \n if options.tsv or options.csv:\n # \'pept2lca\',\'pept2taxa\',\'pept2prot\'\n- pept2lca_column_order = [ \'peptide\',\'superkingdom\',\'kingdom\',\'subkingdom\',\'superphylum\',\'phylum\',\'subphylum\',\'superclass\',\'class_\',\'subclass\',\'infraclass\',\'superorder\',\'order\',\'suborder\',\'infraorder\',\'parvorder\',\'superfamily\',\'family\',\'subfamily\',\'tribe\',\'subtribe\',\'genus\',\'subgenus\',\'species_group\',\'species_subgroup\',\'species\',\'subspecies\',\'varietas\',\'forma\' ]\n- pept2prot_column_order = [ \'peptide\',\'uniprot_id\',\'taxon_id\',\'taxon_name\',\'ec_references\',\'go_references\',\'refseq_ids\',\'refseq_protein_ids\',\'insdc_ids\',\'insdc_protein_ids\']\n- column_order = pept2prot_column_order if options.unipept == \'pept2prot\' else pept2lca_column_order\n found_keys = set()\n results = []\n for i,pdict in enumerate(resp):\n@@ -179,7 +282,8 @@\n taxa = []\n for i,pdict in enumerate(results):\n vals = [str(pdict[x]) if x in pdict and pdict[x] else \'\' for x in column_keys]\n- taxa.append(vals)\n+ if vals not in taxa:\n+ taxa.append(vals)\n if options.tsv:\n with open(options.tsv,\'w\') as outputFile:\n outputFile.write("#%s\\n"% \'\\t\'.join(column_names))\n' |
b |
diff -r 6430407e5869 -r 0c1ee95282fa unipept.xml --- a/unipept.xml Fri Apr 03 14:55:49 2015 -0400 +++ b/unipept.xml Tue Apr 14 16:44:22 2015 -0400 |
b |
b'@@ -1,8 +1,8 @@\n-<tool id="unipept" name="Unipept" version="0.1.0">\n+<tool id="unipept" name="Unipept" version="1.1.0">\n <description>retrieve taxonomy for peptides</description>\n <macros>\n <xml name="equate_il">\n- <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="false" label="Equate isoleucine and leucine">\n+ <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="true" label="Equate isoleucine and leucine">\n <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help>\n </param >\n </xml>\n@@ -13,7 +13,10 @@\n </xml>\n <xml name="names">\n <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" >\n- <help>return the names of taxons</help>\n+ <help>return the names in complete taxonomic lineage</help>\n+ </param >\n+ <param name="allfields" type="boolean" truevalue="-A" falsevalue="" checked="false" label="allfields" >\n+ <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help>\n </param >\n </xml>\n </macros>\n@@ -27,7 +30,7 @@\n --api=$unipept.api\n $unipept.equate_il $unipept.extra \n #if $unipept.api != \'pept2prot\':\n- $unipept.names \n+ $unipept.names $unipept.allfields\n #end if\n $strict\n #if str($peptide_src.fmt) == \'proteomic\':\n@@ -58,29 +61,29 @@\n #if \'csv\' in str($outputs).split(\',\'):\n --csv $output_csv\n #end if\n- #if \'mismatch\' in str($outputs).split(\',\'):\n- --mismatch $output_mismatch\n+ #if \'unmatched\' in str($outputs).split(\',\'):\n+ --unmatched $output_unmatched\n #end if\n ]]></command>\n <inputs>\n <conditional name="unipept">\n <param name="api" type="select" label="Unipept application" >\n- <option value="pept2taxa" selected="true">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>\n- <option value="pept2lca">pept2lca: lowest common ancestor</option>\n+ <option value="pept2lca" selected="true">pept2lca: lowest common ancestor</option>\n+ <option value="pept2taxa">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option>\n <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option>\n </param>\n+ <when value="pept2lca">\n+ <expand macro="equate_il" />\n+ <expand macro="extra">\n+ <help>Return the complete lineage of the taxonomic lowest common ancestor, and include ID fields.</help>\n+ </expand>\n+ <expand macro="names" />\n+ </when>\n <when value="pept2taxa">\n <expand macro="equate_il" />\n <expand macro="extra">\n <checked>true</checked>\n- <help>Return the complete lineage of each organism.</help>\n- </expand>\n- <expand macro="names" />\n- </when>\n- <when value="pept2lca">\n- <expand macro="equate_il" />\n- <expand macro="extra">\n- <help>Return the complete lineage of the taxonomic lowest common ancestor.</help>\n+ <help>Return the complete lineage of each organism, and include ID fields.</help>\n </expand>\n <expand macro="names" />\n </when>\n@@ -122,7 +125,7 @@\n <option value="tsv" selected="true">tabular</option>\n <option value="csv">Comma Separated Values (.csv)</option>\n <option value="json">JSON</option>\n- <option value="mismatch">Mismatches</option>\n+ <option value="unmatched">Unmatched peptides</option>\n </par'..b' <param name="input_fasta" value="peptide.fa"/>\n <param name="equate_il" value="True"/>\n <param name="extra" value="True"/>\n <param name="names" value="True"/>\n- <param name="outputs" value="json,mismatch"/>\n+ <param name="outputs" value="json,tsv"/>\n <output name="output_json">\n <assert_contents>\n- <has_text text="AIPQLEVARPADAYETAEAYR" />\n+ <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" />\n+ </assert_contents>\n+ </output>\n+ <output name="output_tsv">\n+ <assert_contents>\n+ <has_text text="9606" />\n+ <has_text text="9598" />\n </assert_contents>\n </output>\n- <output name="output_mismatch">\n+ </test>\n+ <test>\n+ <param name="api" value="pept2taxa"/>\n+ <param name="fmt" value="fasta"/>\n+ <param name="input_fasta" value="peptide.fa"/>\n+ <param name="equate_il" value="True"/>\n+ <param name="extra" value="False"/>\n+ <param name="names" value="False"/>\n+ <param name="outputs" value="tsv"/>\n+ <output name="output_tsv">\n <assert_contents>\n- <has_text text="DQIAHEGK" />\n+ <has_text text="sapiens" />\n+ <has_text text="troglodytes" />\n+ <has_text text="Gorilla" />\n+ <has_text text="Macaca" />\n </assert_contents>\n </output>\n </test>\n@@ -182,8 +205,24 @@\n **Unipept** \n \n Retrieve Uniprot and taxanomic information for trypic peptides.\n+ \n+ Unipept API documentation - http://unipept.ugent.be/apidocs \n \n- **pept2prot**\n+ **Input**\n+\n+ Input peptides can be retrieved from tabular, fasta, mzid, or pepxml datasets. \n+ \n+ Processing deatils::\n+\n+ The input peptides are split into typtic peptide fragments in order to match the Unipept records. \n+ Only fragments that are complete tryptic peptides between 5 and 50 animo acid in length will be matched by Unipept.\n+ The match to the most specific tryptic fragment is reported.\n+\n+\n+ **Unipept APIs**\n+\n+ **pept2prot** - http://unipept.ugent.be/apidocs/pept2prot\n+\n Returns the list of UniProt entries containing a given tryptic peptide. This is the same information as provided on the Protein matches tab when performing a search with the Tryptic Peptide Analysis in the web interface. \n \n By default, each object contains the following information fields extracted from the UniProt record::\n@@ -202,9 +241,9 @@\n insdc_ids: a space separated list of associated insdc accession numbers\n insdc_protein_ids: a space separated list of associated insdc protein accession numbers\n \n- http://unipept.ugent.be/apidocs/pept2prot\n \n- **pept2taxa**\n+ **pept2taxa** - http://unipept.ugent.be/apidocs/pept2taxa\n+\n Returns the set of organisms associated with the UniProt entries containing a given tryptic peptide. This is the same information as provided on the Lineage table tab when performing a search with the Tryptic Peptide Analysis in the web interface.\n \n By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::\n@@ -245,9 +284,9 @@\n varietas_id\n forma_id\n \n- http://unipept.ugent.be/apidocs/pept2taxa\n \n- **pept2lca** \n+ **pept2lca** - http://unipept.ugent.be/apidocs/pept2lca\n+\n Returns the taxonomic lowest common ancestor for a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface.\n \n By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy::\n@@ -288,7 +327,6 @@\n varietas_id\n forma_id\n \n- http://unipept.ugent.be/apidocs/pept2lca\n \n **Attributions**\n \n' |