Mercurial > repos > galaxyp > unipept
changeset 7:75b3b3d0adbf draft
"planemo upload for repository http://unipept.ugent.be/apidocs commit b6707ea113b2a89b0bb8072dfcc9ceeef4a1b708"
author | galaxyp |
---|---|
date | Tue, 06 Apr 2021 16:13:57 +0000 |
parents | 9aaa46d45472 |
children | 7863f1abcdda |
files | test-data/input.tsv unipept.py unipept.xml |
diffstat | 3 files changed, 124 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/input.tsv Tue Jun 02 10:30:01 2020 -0400 +++ b/test-data/input.tsv Tue Apr 06 16:13:57 2021 +0000 @@ -3,3 +3,5 @@ 3 APVLSDSSCK APVLJDSSCK 4 DQIAHEGK DQUAHEGK 5 ATLTSGAAR ATLTSGAAR +5 FAPLLEEYKAEDWVQK AEDWVQK +6 AVKAMHRRKVKGAIMGNSKTGSIVYIEPEITLRYSRELSNLEYEEKEEVT GSIVYIEPEITLRYSRE
--- a/unipept.py Tue Jun 02 10:30:01 2020 -0400 +++ b/unipept.py Tue Apr 06 16:13:57 2021 +0000 @@ -9,8 +9,8 @@ """ import json import optparse +import re import sys -import re import urllib.error import urllib.parse import urllib.request @@ -102,7 +102,7 @@ } pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] -pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] +pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[2:] pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] @@ -487,6 +487,8 @@ parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') + parser.add_option('-P', '--peptide_match', dest='peptide_match', choices=['best', 'full', 'report'], default='best', help='Match whole peptide') + parser.add_option('--unmatched_aa', dest='unmatched_aa', default=None, help='Show unmatched AA in peptide as') # debug parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') @@ -495,6 +497,16 @@ if options.version: print('%s' % version) sys.exit(0) + + def tryptic_match_string(peptide, tryptic_matches): + if options.unmatched_aa: + p = peptide.lower() + for m in tryptic_matches: + p = p.replace(m.lower(), m) + return re.sub('[a-z]', options.unmatched_aa, p) + else: + return ','.join(tryptic_matches) + invalid_ec = 2 if options.strict else None peptides = [] # Get peptide sequences @@ -522,7 +534,12 @@ else: column_order = pept2lca_column_order # map to tryptic peptides - pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} + if options.peptide_match == 'full': + pepToParts = {p: [p] for p in peptides} + else: + pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} + if options.debug: + print("column_order: %s\n" % (column_order), file=sys.stderr) partToPeps = {} for peptide, parts in pepToParts.items(): if options.debug: @@ -591,8 +608,10 @@ for peptide in peptides: # Get the intersection of matches to the tryptic parts keyToMatch = None + tryptic_match = [] for part in pepToParts[peptide]: if part in mapping: + tryptic_match.append(part) temp = {match[dupkey]: match for match in mapping[part]} if keyToMatch: dkeys = set(keyToMatch.keys()) - set(temp.keys()) @@ -605,6 +624,7 @@ unmatched_peptides.append(peptide) else: for key, match in keyToMatch.items(): + match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) @@ -614,14 +634,17 @@ # map resp back to peptides for peptide in peptides: matches = list() + tryptic_match = [] for part in pepToParts[peptide]: if part in respMap: + tryptic_match.append(part) matches.append(respMap[part]) match = best_match(peptide, matches) if not match: unmatched_peptides.append(peptide) longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] match = {'peptide': longest_tryptic_peptide} + match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) @@ -630,14 +653,17 @@ # map resp back to peptides for peptide in peptides: matches = list() + tryptic_match = [] for part in pepToParts[peptide]: if part in respMap and 'total_protein_count' in respMap[part]: + tryptic_match.append(part) matches.append(respMap[part]) match = best_match(peptide, matches) if not match: unmatched_peptides.append(peptide) longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] match = {'peptide': longest_tryptic_peptide} + match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) @@ -686,6 +712,10 @@ total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' column_names = ['peptide', 'total_protein_count'] vals = [peptide, total_protein_count] + if options.peptide_match == 'report': + column_names = ['peptide', 'tryptic_match', 'total_protein_count'] + tryptic_match = pdict.get('tryptic_match', '') + vals = [peptide, tryptic_match, total_protein_count] if ec_dict: vals += ec_dict[peptide] column_names += ec_cols @@ -700,11 +730,11 @@ column_names += taxon_cols[1:] rows.append(vals) elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: + if options.peptide_match == 'report': + column_order.insert(1, 'tryptic_match') (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) column_names = taxon_cols rows = list(taxa.values()) - for peptide, vals in taxa.items(): - rows.append(vals) if options.tsv: with open(options.tsv, 'w') as outputFile: if column_names:
--- a/unipept.xml Tue Jun 02 10:30:01 2020 -0400 +++ b/unipept.xml Tue Apr 06 16:13:57 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="unipept" name="Unipept" version="4.3.0"> +<tool id="unipept" name="Unipept" version="4.5.0"> <description>retrieve taxonomy for peptides</description> <macros> <xml name="equate_il"> @@ -76,6 +76,10 @@ #elif str($peptide_src.fmt) == 'pepxml': --pepxml="$peptide_src.input_pepxml" #end if + --peptide_match $peptide_match + #if $peptide_match == 'report' and $unmatched_aa != 'default': + --unmatched_aa $unmatched_aa + #end if #if 'json' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2lca', 'pept2taxa', 'peptinfo']: --json $output_json #end if @@ -201,6 +205,16 @@ <param name="input_pepxml" type="data" format="pepxml" label="mzIndetML Input" /> </when> </conditional> + <param name="peptide_match" type="select" label="Match input peptides by"> + <option value="full">Match to the full input peptide</option> + <option value="best" selected="true">Best match to tryptic parts of the input peptide</option> + <option value="report">Best match to tryptic parts, add tryptic_match column</option> + </param> + <param name="unmatched_aa" type="select" label="Show tryptic_match as:"> + <option value="default">List of matched tryptic parts</option> + <option value=".">peptide with unmatched AAs as .</option> + <option value="x">peptide with unmatched AAs as x</option> + </param> <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs"> <option value="tsv" selected="true">Tabular with one line per peptide</option> <option value="csv">Comma Separated Values (.csv) with one line per peptide</option> @@ -272,6 +286,7 @@ </data> </outputs> <tests> + <!-- Test-1 --> <test> <param name="api" value="pept2lca"/> <param name="fmt" value="tabular"/> @@ -291,6 +306,7 @@ </assert_contents> </output> </test> + <!-- Test-2 --> <test> <param name="api" value="pept2lca"/> <param name="fmt" value="fasta"/> @@ -299,8 +315,6 @@ <param name="extra" value="True"/> <param name="names" value="True"/> <param name="selected_outputs" value="json,tsv"/> -<!-- ---> <output name="output_json"> <assert_contents> <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" /> @@ -313,6 +327,7 @@ </assert_contents> </output> </test> + <!-- Test-3 --> <test> <param name="api" value="pept2taxa"/> <param name="fmt" value="fasta"/> @@ -328,6 +343,7 @@ </assert_contents> </output> </test> + <!-- Test-4 --> <test> <param name="api" value="pept2funct"/> <param name="fmt" value="tabular"/> @@ -359,6 +375,74 @@ </assert_contents> </output> </test> + <!-- Test-5 --> + <test> + <param name="api" value="pept2funct"/> + <param name="fmt" value="tabular"/> + <param name="input_tsv" value="input.tsv"/> + <param name="column" value="2"/> + <param name="extra" value="True"/> + <param name="names" value="True"/> + <param name="peptide_match" value="full"/> + <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,ipr_tsv,unmatched"/> + <output name="output_tsv"> + <assert_contents> + <has_text text="GO:0004802" /> + <has_text text="2.2.1.1" /> + <has_text text="IPR005475" /> + <has_text_matching expression="FAPLLEEYKAEDWVQK\t0" /> + </assert_contents> + </output> + <output name="output_ec_tsv"> + <assert_contents> + <has_text text="2.2.1.1" /> + </assert_contents> + </output> + <output name="output_go_tsv"> + <assert_contents> + <has_text text="GO:0004802" /> + </assert_contents> + </output> + <output name="output_ipr_tsv"> + <assert_contents> + <has_text text="IPR005475" /> + </assert_contents> + </output> + </test> + <!-- Test-6 --> + <test> + <param name="api" value="pept2funct"/> + <param name="fmt" value="tabular"/> + <param name="input_tsv" value="input.tsv"/> + <param name="column" value="2"/> + <param name="extra" value="True"/> + <param name="names" value="True"/> + <param name="peptide_match" value="report"/> + <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,ipr_tsv,unmatched"/> + <output name="output_tsv"> + <assert_contents> + <has_text text="GO:0004802" /> + <has_text text="2.2.1.1" /> + <has_text text="IPR005475" /> + <has_text_matching expression="FAPLLEEYKAEDWVQK\tAEDWVQK\t1" /> + </assert_contents> + </output> + <output name="output_ec_tsv"> + <assert_contents> + <has_text text="2.2.1.1" /> + </assert_contents> + </output> + <output name="output_go_tsv"> + <assert_contents> + <has_text text="GO:0004802" /> + </assert_contents> + </output> + <output name="output_ipr_tsv"> + <assert_contents> + <has_text text="IPR005475" /> + </assert_contents> + </output> + </test> </tests> <help><![CDATA[ **Unipept**