Mercurial > repos > galaxyp > unipept
changeset 4:4953dcd7dd39 draft
planemo upload for repository http://unipept.ugent.be/apidocs commit e91b0fe16bf468b34884508652359b91847d1f95-dirty
author | galaxyp |
---|---|
date | Wed, 23 Jan 2019 09:16:38 -0500 |
parents | 34758ab8aaa4 |
children | 917fd3ebc223 |
files | unipept.py unipept.xml |
diffstat | 2 files changed, 593 insertions(+), 110 deletions(-) [+] |
line wrap: on
line diff
--- a/unipept.py Mon Feb 20 10:32:03 2017 -0500 +++ b/unipept.py Wed Jan 23 09:16:38 2019 -0500 @@ -21,6 +21,18 @@ import re import urllib import urllib2 + +""" +pept2taxa json +pept2lca json +pept2prot +pept2ec ecjson ec +pept2go go +pept2funct go ec +peptinfo json ecjson ec go + +""" + try: import xml.etree.cElementTree as ET except ImportError: @@ -31,11 +43,87 @@ if exit_code: sys.exit(exit_code) +go_types = ['biological process', 'molecular function', 'cellular component'] +ec_name_dict = { +'1' : 'Oxidoreductase', +'1.1' : 'act on the CH-OH group of donors', +'1.2' : 'act on the aldehyde or oxo group of donors', +'1.3' : 'act on the CH-CH group of donors', +'1.4' : 'act on the CH-NH2 group of donors', +'1.5' : 'act on CH-NH group of donors', +'1.6' : 'act on NADH or NADPH', +'1.7' : 'act on other nitrogenous compounds as donors', +'1.8' : 'act on a sulfur group of donors', +'1.9' : 'act on a heme group of donors', +'1.10' : 'act on diphenols and related substances as donors', +'1.11' : 'act on peroxide as an acceptor -- peroxidases', +'1.12' : 'act on hydrogen as a donor', +'1.13' : 'act on single donors with incorporation of molecular oxygen', +'1.14' : 'act on paired donors with incorporation of molecular oxygen', +'1.15' : 'act on superoxide radicals as acceptors', +'1.16' : 'oxidize metal ions', +'1.17' : 'act on CH or CH2 groups', +'1.18' : 'act on iron-sulfur proteins as donors', +'1.19' : 'act on reduced flavodoxin as donor', +'1.20' : 'act on phosphorus or arsenic as donors', +'1.21' : 'act on X-H and Y-H to form an X-Y bond', +'1.97' : 'other oxidoreductases', +'2' : 'Transferase', +'2.1' : 'transfer one-carbon groups, Methylase', +'2.2' : 'transfer aldehyde or ketone groups', +'2.3' : 'acyltransferases', +'2.4' : 'glycosyltransferases', +'2.5' : 'transfer alkyl or aryl groups, other than methyl groups', +'2.6' : 'transfer nitrogenous groups', +'2.7' : 'transfer phosphorus-containing groups', +'2.8' : 'transfer sulfur-containing groups', +'2.9' : 
'transfer selenium-containing groups', +'3' : 'Hydrolase', +'3.1' : 'act on ester bonds', +'3.2' : 'act on sugars - glycosylases', +'3.3' : 'act on ether bonds', +'3.4' : 'act on peptide bonds - Peptidase', +'3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds', +'3.6' : 'act on acid anhydrides', +'3.7' : 'act on carbon-carbon bonds', +'3.8' : 'act on halide bonds', +'3.9' : 'act on phosphorus-nitrogen bonds', +'3.10' : 'act on sulfur-nitrogen bonds', +'3.11' : 'act on carbon-phosphorus bonds', +'3.12' : 'act on sulfur-sulfur bonds', +'3.13' : 'act on carbon-sulfur bonds', +'4' : 'Lyase', +'4.1' : 'carbon-carbon lyases', +'4.2' : 'carbon-oxygen lyases', +'4.3' : 'carbon-nitrogen lyases', +'4.4' : 'carbon-sulfur lyases', +'4.5' : 'carbon-halide lyases', +'4.6' : 'phosphorus-oxygen lyases', +'5' : 'Isomerase', +'5.1' : 'racemases and epimerases', +'5.2' : 'cis-trans-isomerases', +'5.3' : 'intramolecular oxidoreductases', +'5.4' : 'intramolecular transferases -- mutases', +'5.5' : 'intramolecular lyases', +'5.99' : 'other isomerases', +'6' : 'Ligase', +'6.1' : 'form carbon-oxygen bonds', +'6.2' : 'form carbon-sulfur bonds', +'6.3' : 'form carbon-nitrogen bonds', +'6.4' : 'form carbon-carbon bonds', +'6.5' : 'form phosphoric ester bonds', +'6.6' : 'form nitrogen-metal bonds', +} pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] pept2prot_extra_column_order = pept2prot_column_order + 
['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] +pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] +pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] +pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] +pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] +pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go'] def __main__(): version = '2.0' @@ -95,7 +183,7 @@ return None elif len(matches) == 1: return matches[0].copy() - else: + elif 'taxon_rank' in matches[0]: # find the most specific match (peptide is always the first column order field) for col in reversed(pept2lca_extra_column_order[1:]): col_id = col+"_id" if options.extra else col @@ -104,12 +192,229 @@ return match.copy() if col_id in match and match[col_id]: return match.copy() + else: + return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy() return None + def get_taxon_json(resp): + found_keys = set() + for i,pdict in enumerate(resp): + found_keys |= set(pdict.keys()) + taxa_cols = [] + for col in pept2lca_extra_column_order[-1:0:-1]: + if col+'_id' in found_keys: + taxa_cols.append(col) + id_to_node = dict() + def get_node(id,name,rank,child,seq): + if id not in id_to_node: + data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } + node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } + id_to_node[id] = node + else: + node = id_to_node[id] + node['data']['count'] += 1 + if seq is not None and seq not in node['data']['sequences']: + node['data']['sequences'].append(seq) + if child is None: + node['data']['self_count'] += 1 + elif child['id'] not in node['kids']: + node['kids'].append(child['id']) + node['children'].append(child) + return node + root = 
get_node(1,'root','no rank',None,None) + for i,pdict in enumerate(resp): + sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) + seq = sequence + child = None + for col in taxa_cols: + col_id = col+'_id' + if col_id in pdict and pdict.get(col_id): + col_name = col if col in found_keys else col+'_name' + child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) + seq = None + if child: + get_node(1,'root','no rank',child,None) + return root + + def get_ec_json(resp): + ecMap = dict() + for pdict in resp: + if 'ec' in pdict: + for ec in pdict['ec']: + ec_number = ec['ec_number'] + if ec_number not in ecMap: + ecMap[ec_number] = [] + ecMap[ec_number].append(pdict) + def get_ids(ec): + ids = [] + i = len(ec) + while i >= 0: + ids.append(ec[:i]) + i = ec.rfind('.',0,i - 1) + return ids + id_to_node = dict() + def get_node(id,name,child,seq): + if id not in id_to_node: + data = {'count' : 0, 'self_count' : 0, 'sequences' : [] } + node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } + id_to_node[id] = node + else: + node = id_to_node[id] + node['data']['count'] += 1 + if seq is not None and seq not in node['data']['sequences']: + node['data']['sequences'].append(seq) + if child is None: + node['data']['self_count'] += 1 + elif child['id'] not in node['kids']: + node['kids'].append(child['id']) + node['children'].append(child) + return node + root = get_node(0,'-.-.-.-',None,None) + for i in range(1,7): + child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None) + get_node(0,'-.-.-.-',child,None) + for i,pdict in enumerate(resp): + sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) + seq = sequence + if 'ec' in pdict: + for ec in pdict['ec']: + child = None + protein_count = ec['protein_count'] + ec_number = ec['ec_number'] + for ec_id in get_ids(ec_number): + ec_name = str(ec_id) + ## if len(ec_id) == 3: + ## ec_name = '%s\n%s\n%s' %(str(ec_id), ec_name_dict[str(ec_id[0])], 
ec_name_dict[str(ec_id)]) + child = get_node(ec_id,ec_name,child,seq) + seq = None + if child: + get_node(0,'-.-.-.-',child,None) + return root + + def get_taxon_dict(resp, column_order, extra=False, names=False): + found_keys = set() + results = [] + for i,pdict in enumerate(resp): + results.append(pdict) + found_keys |= set(pdict.keys()) + # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys) + column_names = [] + column_keys = [] + for col in column_order: + if col in found_keys: + column_names.append(col) + column_keys.append(col) + elif names: + col_id = col+'_id' + col_name = col+'_name' + if extra: + if col_id in found_keys: + column_names.append(col_id) + column_keys.append(col_id) + if names: + if col_name in found_keys: + column_names.append(col) + column_keys.append(col_name) + else: + if col+'_name' in found_keys: + column_names.append(col) + column_keys.append(col+'_name') + elif col+'_id' in found_keys: + column_names.append(col) + column_keys.append(col+'_id') + # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) + taxa = dict() ## peptide : [taxonomy] + for i,pdict in enumerate(results): + peptide = pdict['peptide'] if 'peptide' in pdict else None + if peptide and peptide not in taxa: + vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] + taxa[peptide] = vals + return (taxa,column_names) + + def get_ec_dict(resp, extra=False): + ec_cols = ['ec_numbers', 'ec_protein_counts'] + if extra: + ec_cols.append('ec_names') + ec_dict = dict() + for i,pdict in enumerate(resp): + peptide = pdict['peptide'] + ec_numbers = [] + protein_counts = [] + ec_names = [] + if 'ec' in pdict: + for ec in pdict['ec']: + ec_numbers.append(ec['ec_number']) + protein_counts.append(str(ec['protein_count'])) + if extra: + ec_names.append(ec['name'] if 'name' in ec else '') + vals = [','.join(ec_numbers),','.join(protein_counts)] + if extra: + vals.append(','.join(ec_names)) + ec_dict[peptide] = vals + return (ec_dict, ec_cols) + + def 
get_go_dict(resp, extra=False): + go_cols = ['go_terms', 'go_protein_counts'] + if extra: + go_cols.append('go_names') + go_dict = dict() + for i,pdict in enumerate(resp): + peptide = pdict['peptide'] + go_terms = [] + protein_counts = [] + go_names = [] + if 'go' in pdict: + for go in pdict['go']: + if 'go_term' in go: + go_terms.append(go['go_term']) + protein_counts.append(str(go['protein_count'])) + if extra: + go_names.append(go['name'] if 'name' in go else '') + else: + for go_type in go_types: + if go_type in go: + for _go in go[go_type]: + go_terms.append(_go['go_term']) + protein_counts.append(str(_go['protein_count'])) + if extra: + go_names.append(_go['name'] if 'name' in _go else '') + vals = [','.join(go_terms),','.join(protein_counts)] + if extra: + vals.append(','.join(go_names)) + go_dict[peptide] = vals + return (go_dict, go_cols) + + def write_ec_table(outfile, resp, column_order): + with open(outfile,'w') as fh: + for i,pdict in enumerate(resp): + if 'ec' in pdict: + tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] + for ec in pdict['ec']: + vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]] + fh.write('%s\n' % '\t'.join(tvals + vals)) + + def write_go_table(outfile, resp, column_order): + with open(outfile,'w') as fh: + for i,pdict in enumerate(resp): + if 'go' in pdict: + tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] + for go in pdict['go']: + if 'go_term' in go: + vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]] + fh.write('%s\n' % '\t'.join(tvals + vals)) + else: + for go_type in go_types: + if go_type in go: + for _go in go[go_type]: + vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]] + vals.append(go_type) + fh.write('%s\n' % '\t'.join(tvals + vals)) + #Parse Command Line parser = optparse.OptionParser() # unipept API choice - parser.add_option( '-a', '--api', dest='unipept', 
default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) + parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'], + help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' ) # input files parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) @@ -120,6 +425,7 @@ parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) + parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' ) parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) # output fields @@ -128,10 +434,16 @@ parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) # output files parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') + parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json 
formatted results') + parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results') + parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results') + parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) + parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' ) # debug + parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' ) parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) (options, args) = parser.parse_args() @@ -188,15 +500,32 @@ post_data.append(("names","true")) elif options.extra or options.json: post_data.append(("extra","true")) + if options.domains: + post_data.append(("domains","true")) post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] + if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data)) headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} - url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept - req = urllib2.Request( url, headers = headers, data = 
urllib.urlencode(post_data) ) - unipept_resp += json.loads( urllib2.urlopen( req ).read() ) + ## headers = {'Accept': 'application/json'} + url = '%s/%s' % (options.url.rstrip('/'),options.unipept) + if options.get: + params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data]) + url = '%s.json?%s' % (url,params) + req = urllib2.Request( url ) + else: + url = '%s.json' % (url) + req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) + if options.debug: print >> sys.stdout, "url: %s\n" % (str(url)) + try: + resp = urllib2.urlopen( req ) + if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode())) + if resp.getcode() == 200: + unipept_resp += json.loads( urllib2.urlopen( req ).read() ) + except Exception, e: + warn_err('HTTP Error %s\n' % (str(e)),exit_code=None) unmatched_peptides = [] peptideMatches = [] if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) - if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': + if options.unipept in ['pept2prot', 'pept2taxa']: dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide ## multiple entries per trypticPeptide for pep2prot or pep2taxa mapping = {} @@ -222,7 +551,7 @@ match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) - else: + elif options.unipept in ['pept2lca', 'peptinfo']: ## should be one response per trypticPeptide for pep2lca respMap = {v['peptide']:v for v in unipept_resp} ## map resp back to peptides @@ -239,6 +568,22 @@ match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) + else: + respMap = {v['peptide']:v for v in unipept_resp} + ## map resp back to peptides + for peptide in peptides: + matches = list() + for part in pepToParts[peptide]: + if part in respMap and 'total_protein_count' in respMap[part]: + matches.append(respMap[part]) + match = 
best_match(peptide,matches) + if not match: + unmatched_peptides.append(peptide) + longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] + match = {'peptide' : longest_tryptic_peptide} + match['tryptic_peptide'] = match['peptide'] + match['peptide'] = peptide + peptideMatches.append(match) resp = peptideMatches if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp) ## output results @@ -250,96 +595,69 @@ if peptide in unmatched_peptides: outputFile.write("%s\n" % peptide) if options.json: - if options.unipept == 'pept2prot': - with open(options.json,'w') as outputFile: - outputFile.write(str(resp)) - else: - found_keys = set() - for i,pdict in enumerate(resp): - found_keys |= set(pdict.keys()) - taxa_cols = [] - for col in pept2lca_extra_column_order[-1:0:-1]: - if col+'_id' in found_keys: - taxa_cols.append(col) - id_to_node = dict() - def get_node(id,name,rank,child,seq): - if id not in id_to_node: - data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } - node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } - id_to_node[id] = node - else: - node = id_to_node[id] - node['data']['count'] += 1 - if seq is not None and seq not in node['data']['sequences']: - node['data']['sequences'].append(seq) - if child is None: - node['data']['self_count'] += 1 - elif child['id'] not in node['kids']: - node['kids'].append(child['id']) - node['children'].append(child) - return node - root = get_node(1,'root','no rank',None,None) - for i,pdict in enumerate(resp): - sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) - seq = sequence - child = None - for col in taxa_cols: - col_id = col+'_id' - if col_id in pdict and pdict.get(col_id): - col_name = col if col in found_keys else col+'_name' - child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) - seq = None - if child: - get_node(1,'root','no rank',child,None) + if options.unipept in 
['pept2lca', 'pept2taxa', 'peptinfo']: + root = get_taxon_json(resp) with open(options.json,'w') as outputFile: outputFile.write(json.dumps(root)) + elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']: + with open(options.json,'w') as outputFile: + outputFile.write(str(resp)) + if options.ec_json: + if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: + root = get_ec_json(resp) + with open(options.ec_json,'w') as outputFile: + outputFile.write(json.dumps(root)) if options.tsv or options.csv: - # 'pept2lca','pept2taxa','pept2prot' - found_keys = set() - results = [] - for i,pdict in enumerate(resp): - results.append(pdict) - found_keys |= set(pdict.keys()) - # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys) - column_names = [] - column_keys = [] - for col in column_order: - if col in found_keys: - column_names.append(col) - column_keys.append(col) - elif options.extra or options.names: - col_id = col+'_id' - col_name = col+'_name' - if options.extra: - if col_id in found_keys: - column_names.append(col_id) - column_keys.append(col_id) - if options.names: - if col_name in found_keys: - column_names.append(col) - column_keys.append(col_name) - else: - if col+'_name' in found_keys: - column_names.append(col) - column_keys.append(col+'_name') - elif col+'_id' in found_keys: - column_names.append(col) - column_keys.append(col+'_id') - # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) - taxa = [] - for i,pdict in enumerate(results): - vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] - if vals not in taxa: - taxa.append(vals) + rows = [] + column_names = None + if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']: + taxa = None + ec_dict = None + go_dict = None + if options.unipept in ['peptinfo']: + (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) + if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: + 
(ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra) + if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']: + (go_dict,go_cols) = get_go_dict(resp, extra=options.extra) + for i,pdict in enumerate(resp): + peptide = pdict['peptide'] + total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' + column_names = ['peptide', 'total_protein_count'] + vals = [peptide,total_protein_count] + if ec_dict: + vals += ec_dict[peptide] + column_names += ec_cols + if go_dict: + vals += go_dict[peptide] + column_names += go_cols + if taxa: + vals += taxa[peptide][1:] + column_names += taxon_cols[1:] + rows.append(vals) + elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: + (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) + column_names = taxon_cols + rows = taxa.values() + for peptide,vals in taxa.iteritems(): + rows.append(vals) if options.tsv: with open(options.tsv,'w') as outputFile: - outputFile.write("#%s\n"% '\t'.join(column_names)) - for vals in taxa: + if column_names: + outputFile.write("#%s\n"% '\t'.join(column_names)) + for vals in rows: outputFile.write("%s\n"% '\t'.join(vals)) if options.csv: with open(options.csv,'w') as outputFile: - outputFile.write("%s\n"% ','.join(column_names)) - for vals in taxa: + if column_names: + outputFile.write("%s\n"% ','.join(column_names)) + for vals in rows: outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals])) + if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: + column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order + write_ec_table(options.ec_tsv, resp, column_order) + if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']: + column_order = pept2go_extra_column_order if options.extra else pept2go_column_order + write_go_table(options.go_tsv, resp, column_order) if __name__ == "__main__" : __main__()
--- a/unipept.xml Mon Feb 20 10:32:03 2017 -0500 +++ b/unipept.xml Wed Jan 23 09:16:38 2019 -0500 @@ -1,4 +1,4 @@ -<tool id="unipept" name="Unipept" version="2.0.1"> +<tool id="unipept" name="Unipept" version="4.0.0"> <description>retrieve taxonomy for peptides</description> <macros> <xml name="equate_il"> @@ -24,6 +24,20 @@ <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help> </param> </xml> + <xml name="domains"> + <param name="domains" type="boolean" truevalue="-D" falsevalue="" checked="false" label="group responses by GO namespace (biological process, molecular function, cellular component)"> + <yield/> + </param> + </xml> + <xml name="selected_outputs"> + <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs"> + <option value="tsv" selected="true">Tabular with one line per peptide</option> + <option value="csv">Comma Separated Values (.csv) with one line per peptide</option> + <option value="json">JSON Taxomony Tree (for pept2lca, pep2taxa, and peptinfo)</option> + <yield/> + <option value="unmatched">Unmatched peptides</option> + </param> + </xml> </macros> <requirements> <requirement type="package" version="2.7">python</requirement> @@ -33,11 +47,15 @@ </stdio> <command><![CDATA[ python '$__tool_directory__/unipept.py' + ## --url 'http://morty.ugent.be/api/v1' -g -M 1 --api=$unipept.api $unipept.equate_il $unipept.extra - #if $unipept.api != 'pept2prot': + #if $unipept.api in ['pept2lca', 'pept2taxa', 'peptinfo']: $unipept.names $unipept.allfields #end if + #if $unipept.api in ['pept2go', 'pept2funct', 'peptinfo']: + $unipept.domains + #end if $strict #if str($peptide_src.fmt) == 'proteomic': #if $peptide_src.input.datatype.file_ext == 'fasta': @@ -58,16 +76,25 @@ #elif str($peptide_src.fmt) == 'pepxml': --pepxml="$peptide_src.input_pepxml" #end if - #if 'json' in str($outputs).split(',') and str($unipept.api) != 'pept2prot': + #if 'json' in 
str($selected_outputs).split(',') and str($unipept.api) in ['pept2lca', 'pept2taxa', 'peptinfo']: --json $output_json #end if - #if 'tsv' in str($outputs).split(','): + #if 'ec_json' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2ec', 'pept2funct', 'peptinfo']: + --ec_json $output_ec_json + #end if + #if 'tsv' in str($selected_outputs).split(','): --tsv $output_tsv #end if - #if 'csv' in str($outputs).split(','): + #if 'csv' in str($selected_outputs).split(','): --csv $output_csv #end if - #if 'unmatched' in str($outputs).split(','): + #if 'ec_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2ec', 'pept2funct', 'peptinfo']: + --ec_tsv $output_ec_tsv + #end if + #if 'go_tsv' in str($selected_outputs).split(',') and str($unipept.api) in ['pept2go', 'pept2funct', 'peptinfo']: + --go_tsv $output_go_tsv + #end if + #if 'unmatched' in str($selected_outputs).split(','): --unmatched $output_unmatched #end if ]]></command> @@ -77,6 +104,10 @@ <option value="pept2lca" selected="true">pept2lca: lowest common ancestor</option> <option value="pept2taxa">pept2taxa: organisms associated with the UniProt entries containing a given tryptic peptide</option> <option value="pept2prot">pept2prot: UniProt entries containing a given tryptic peptide</option> + <option value="pept2ec">pept2ec: Tryptic peptides and associated EC terms</option> + <option value="pept2go">pept2go: Tryptic peptides and associated GO terms</option> + <option value="pept2funct">pept2funct: Tryptic peptides and associated EC and GO terms</option> + <option value="peptinfo">peptinfo: Tryptic peptides and associated EC and GO terms and lowest common ancestor taxonomy</option> </param> <when value="pept2lca"> <expand macro="equate_il" /> @@ -100,6 +131,38 @@ </help> </expand> </when> + <when value="pept2ec"> + <expand macro="equate_il" /> + <expand macro="extra_true"> + <help>Return the name of the EC-number. 
+ </help> + </expand> + </when> + <when value="pept2go"> + <expand macro="equate_il" /> + <expand macro="extra_true"> + <help>Return the name of the GO-term. + </help> + </expand> + <expand macro="domains" /> + </when> + <when value="pept2funct"> + <expand macro="equate_il" /> + <expand macro="extra_true"> + <help>Return the name of the EC-number and GO-term. + </help> + </expand> + <expand macro="domains" /> + </when> + <when value="peptinfo"> + <expand macro="equate_il" /> + <expand macro="extra_true"> + <help>Return the name of the EC-number and GO-term. + </help> + </expand> + <expand macro="domains" /> + <expand macro="names" /> + </when> </conditional> <conditional name="peptide_src"> <param name="fmt" type="select" label="Peptides input format" > @@ -126,29 +189,66 @@ <param name="input_pepxml" type="data" format="pepxml" label="mzIndetML Input" /> </when> </conditional> - <param name="outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs"> - <option value="tsv" selected="true">tabular</option> - <option value="csv">Comma Separated Values (.csv)</option> - <option value="json">JSON Taxomony Tree (ignored for pept2prot)</option> + <param name="selected_outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs"> + <option value="tsv" selected="true">Tabular with one line per peptide</option> + <option value="csv">Comma Separated Values (.csv) with one line per peptide</option> + <option value="json">JSON Taxomony Tree (for pept2lca, pep2taxa, and peptinfo)</option> + <option value="go_tsv">Peptide GO terms in normalized tabular (for pept2go, pept2funct, and peptinfo)</option> + <option value="ec_tsv">Peptide EC terms in normalized tabular (for pept2ec, pept2funct, and peptinfo)</option> + <option value="ec_json">JSON EC Coverage Tree (for pept2ec, pep2funct, and peptinfo)</option> <option value="unmatched">Unmatched peptides</option> </param> <param name="strict" type="boolean" truevalue="--strict" 
falsevalue="" checked="false" label="Exit with error on invalid peptides, otherwise ignore them"/> </inputs> <outputs> - <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} json"> - <filter>'json' in outputs and unipept['api'] != 'pept2prot'</filter> + <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} Taxonomy json"> + <filter>'json' in selected_outputs and unipept['api'] in ('pept2lca', 'pept2taxa', 'peptinfo')</filter> <change_format> <when input="api" value="pept2prot" format="json" /> </change_format> </data> + <data name="output_ec_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} EC json"> + <filter>'ec_json' in selected_outputs and unipept['api'] in ('pept2ec', 'pept2funct', 'peptinfo')</filter> + </data> <data name="output_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} tsv"> - <filter>'tsv' in outputs</filter> + <filter>'tsv' in selected_outputs</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <!-- + <conditional name="unipept.api"> + <when value="pept2funct"> + <action name="column_names" type="metadata" default="peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names,go_terms,go_protein_counts,go_names" /> + </when> + <when value="pept2go"> + <action name="column_names" type="metadata" default="peptide,total_protein_count,go_terms,go_protein_counts,go_names" /> + </when> + <when value="pept2ec"> + <action name="column_names" type="metadata" default="peptide,total_protein_count,ec_numbers,ec_protein_counts,ec_names" /> + </when> + </conditional> + --> + </actions> </data> <data name="output_csv" format="csv" label="${tool.name} ${unipept.api} on ${on_string} csv"> - <filter>'csv' in outputs</filter> + <filter>'csv' in selected_outputs</filter> + </data> + <data name="output_ec_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} EC tsv"> + 
<filter>'ec_tsv' in selected_outputs and unipept['api'] in ('pept2ec', 'pept2funct', 'peptinfo')</filter> + <actions> + <action name="column_names" type="metadata" default="Peptide,Total Protein Count,EC Number,Protein Count,EC Name" /> + </actions> + </data> + <data name="output_go_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} GO tsv"> + <filter>'go_tsv' in selected_outputs and unipept['api'] in ('pept2go', 'pept2funct', 'peptinfo')</filter> + <actions> + <action name="column_names" type="metadata" default="Peptide,Total Protein Count,GO Term,Protein Count,GO Name" /> + </actions> </data> <data name="output_unmatched" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} unmatched"> - <filter>'unmatched' in outputs</filter> + <filter>'unmatched' in selected_outputs</filter> + <actions> + <action name="column_names" type="metadata" default="Unmatched Peptide" /> + </actions> </data> </outputs> <tests> @@ -159,7 +259,7 @@ <param name="column" value="2"/> <param name="extra" value="True"/> <param name="names" value="True"/> - <param name="outputs" value="tsv,unmatched"/> + <param name="selected_outputs" value="tsv,unmatched"/> <output name="output_tsv"> <assert_contents> <has_text text="Homininae" /> @@ -178,7 +278,7 @@ <param name="equate_il" value="True"/> <param name="extra" value="True"/> <param name="names" value="True"/> - <param name="outputs" value="json,tsv"/> + <param name="selected_outputs" value="json,tsv"/> <output name="output_json"> <assert_contents> <has_text text="VMDVNDHKPEFYNCSLPACTFTPEEAQVNFTGYVDEHASPHIPIDDLTMVVYDPDKGSNGTFLLSLGGPDAEAFSVSPERAAGSASVQVLVRVSALVDYERQTAMAV" /> @@ -187,7 +287,7 @@ <output name="output_tsv"> <assert_contents> <has_text text="9606" /> - <has_text text="9598" /> + <has_text text="9596" /> </assert_contents> </output> </test> @@ -198,13 +298,37 @@ <param name="equate_il" value="True"/> <param name="extra" value="False"/> <param name="names" value="False"/> - <param name="outputs" 
value="tsv"/> + <param name="selected_outputs" value="tsv"/> <output name="output_tsv"> <assert_contents> <has_text text="sapiens" /> - <has_text text="troglodytes" /> + <has_text text="paniscus" /> <has_text text="Gorilla" /> - <has_text text="Macaca" /> + </assert_contents> + </output> + </test> + <test> + <param name="api" value="pept2funct"/> + <param name="fmt" value="tabular"/> + <param name="input_tsv" value="input.tsv"/> + <param name="column" value="2"/> + <param name="extra" value="True"/> + <param name="names" value="True"/> + <param name="selected_outputs" value="tsv,ec_tsv,go_tsv,unmatched"/> + <output name="output_tsv"> + <assert_contents> + <has_text text="GO:0004802" /> + <has_text text="2.2.1.1" /> + </assert_contents> + </output> + <output name="output_ec_tsv"> + <assert_contents> + <has_text text="2.2.1.1" /> + </assert_contents> + </output> + <output name="output_go_tsv"> + <assert_contents> + <has_text text="GO:0004802" /> </assert_contents> </output> </test> @@ -335,6 +459,47 @@ varietas_id forma_id + **pept2ec** - http://unipept.ugent.be/apidocs/pept2ec + + Returns the functional EC-numbers associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface. + + By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy:: + + peptide: the peptide that matched this record + total_protein_count: Total number of proteins matched with the given peptide + ec_number: EC-number associated with the current tryptic peptide. + protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current EC-number. + name: Optional, name of the EC-number. Included when the extra parameter is set to true. + + + **pept2go** - http://unipept.ugent.be/apidocs/pept2go + + Returns the functional GO-terms associated with a given tryptic peptide. 
This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface. + + By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy:: + + peptide: the peptide that matched this record + total_protein_count: Total number of proteins matched with the given peptide + go_term: The GO-term associated with the current tryptic peptide. + protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current GO-term. + name: Optional, name of the GO-term. Included when the extra parameter is set to true. + + + **pept2funct** - http://unipept.ugent.be/apidocs/pept2funct + + Returns the functional EC-numbers and GO-terms associated with a given tryptic peptide. This is the same information as provided when performing a search with the Tryptic Peptide Analysis in the web interface. + + By default, each object contains the following information fields extracted from the UniProt record and NCBI taxonomy:: + + peptide: the peptide that matched this record + total_protein_count: Total number of proteins matched with the given peptide + ec_number: EC-number associated with the current tryptic peptide. + protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current EC-number. + name: Optional, name of the EC-number. Included when the extra parameter is set to true. + go_term: The GO-term associated with the current tryptic peptide. + protein_count: number of proteins matched with the given tryptic peptide that are labeled with the current GO-term. + name: Optional, name of the GO-term. Included when the extra parameter is set to true. + **Attributions**