Mercurial > repos > galaxyp > unipept
changeset 3:34758ab8aaa4 draft
Uploaded
| field | value |
|---|---|
| author | galaxyp |
| date | Mon, 20 Feb 2017 10:32:03 -0500 (2017-02-20) |
| parents | 503ab8a39006 |
| children | 4953dcd7dd39 |
| files | unipept.py unipept.xml |
| diffstat | 2 files changed, 84 insertions(+), 30 deletions(-) [+] |
line wrap: on
line diff
--- a/unipept.py Wed Dec 07 16:44:07 2016 -0500 +++ b/unipept.py Mon Feb 20 10:32:03 2017 -0500 @@ -38,7 +38,7 @@ pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] def __main__(): - version = '1.1' + version = '2.0' pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' def read_tabular(filepath,col): @@ -120,6 +120,8 @@ parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) + parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) + # output fields parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) # Warn vs Error Flag @@ -174,19 +176,23 @@ partToPeps.setdefault(part,[]).append(peptide) trypticPeptides = partToPeps.keys() ## unipept - post_data = [] - if options.equate_il: - post_data.append(("equate_il","true")) - if options.names: - post_data.append(("extra","true")) - post_data.append(("names","true")) - elif options.extra: - post_data.append(("extra","true")) - post_data += [('input[]', x) for x in trypticPeptides] - headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} - url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept - req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) - unipept_resp = 
json.loads( urllib2.urlopen( req ).read() ) + unipept_resp = [] + idx = range(0,len(trypticPeptides),options.max_request) + idx.append(len(trypticPeptides)) + for i in range(len(idx)-1): + post_data = [] + if options.equate_il: + post_data.append(("equate_il","true")) + if options.names or options.json: + post_data.append(("extra","true")) + post_data.append(("names","true")) + elif options.extra or options.json: + post_data.append(("extra","true")) + post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] + headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} + url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept + req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) + unipept_resp += json.loads( urllib2.urlopen( req ).read() ) unmatched_peptides = [] peptideMatches = [] if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) @@ -244,8 +250,49 @@ if peptide in unmatched_peptides: outputFile.write("%s\n" % peptide) if options.json: - with open(options.json,'w') as outputFile: - outputFile.write(str(resp)) + if options.unipept == 'pept2prot': + with open(options.json,'w') as outputFile: + outputFile.write(str(resp)) + else: + found_keys = set() + for i,pdict in enumerate(resp): + found_keys |= set(pdict.keys()) + taxa_cols = [] + for col in pept2lca_extra_column_order[-1:0:-1]: + if col+'_id' in found_keys: + taxa_cols.append(col) + id_to_node = dict() + def get_node(id,name,rank,child,seq): + if id not in id_to_node: + data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } + node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } + id_to_node[id] = node + else: + node = id_to_node[id] + node['data']['count'] += 1 + if seq is not None and seq not in node['data']['sequences']: + node['data']['sequences'].append(seq) + if child is None: + node['data']['self_count'] += 1 + elif 
child['id'] not in node['kids']: + node['kids'].append(child['id']) + node['children'].append(child) + return node + root = get_node(1,'root','no rank',None,None) + for i,pdict in enumerate(resp): + sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) + seq = sequence + child = None + for col in taxa_cols: + col_id = col+'_id' + if col_id in pdict and pdict.get(col_id): + col_name = col if col in found_keys else col+'_name' + child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) + seq = None + if child: + get_node(1,'root','no rank',child,None) + with open(options.json,'w') as outputFile: + outputFile.write(json.dumps(root)) if options.tsv or options.csv: # 'pept2lca','pept2taxa','pept2prot' found_keys = set()
--- a/unipept.xml Wed Dec 07 16:44:07 2016 -0500 +++ b/unipept.xml Mon Feb 20 10:32:03 2017 -0500 @@ -1,33 +1,38 @@ -<tool id="unipept" name="Unipept" version="1.1.0"> +<tool id="unipept" name="Unipept" version="2.0.1"> <description>retrieve taxonomy for peptides</description> <macros> <xml name="equate_il"> <param name="equate_il" type="boolean" truevalue="-e" falsevalue="" checked="true" label="Equate isoleucine and leucine"> <help>isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records</help> - </param > + </param> </xml> <xml name="extra"> <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="false" label="retrieve extra information"> <yield/> - </param > + </param> + </xml> + <xml name="extra_true"> + <param name="extra" type="boolean" truevalue="-x" falsevalue="" checked="true" label="retrieve extra information"> + <yield/> + </param> </xml> <xml name="names"> <param name="names" type="boolean" truevalue="-n" falsevalue="" checked="true" label="names" > <help>return the names in complete taxonomic lineage</help> - </param > + </param> <param name="allfields" type="boolean" truevalue="-A" falsevalue="" checked="false" label="allfields" > <help>include fields for most specific taxonomic classification: taxon_rank,taxon_id,taxon_name before lineage</help> - </param > + </param> </xml> </macros> <requirements> - <requirement type="package" version="2.7.12">python</requirement> + <requirement type="package" version="2.7">python</requirement> </requirements> <stdio> <exit_code range="1:" /> </stdio> - <command interpreter="python"><![CDATA[ - unipept.py + <command><![CDATA[ + python '$__tool_directory__/unipept.py' --api=$unipept.api $unipept.equate_il $unipept.extra #if $unipept.api != 'pept2prot': @@ -53,7 +58,7 @@ #elif str($peptide_src.fmt) == 'pepxml': --pepxml="$peptide_src.input_pepxml" #end if - #if 'json' in str($outputs).split(','): + #if 'json' in str($outputs).split(',') and str($unipept.api) != 
'pept2prot': --json $output_json #end if #if 'tsv' in str($outputs).split(','): @@ -82,8 +87,7 @@ </when> <when value="pept2taxa"> <expand macro="equate_il" /> - <expand macro="extra"> - <checked>true</checked> + <expand macro="extra_true"> <help>Return the complete lineage of each organism, and include ID fields.</help> </expand> <expand macro="names" /> @@ -125,14 +129,17 @@ <param name="outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs"> <option value="tsv" selected="true">tabular</option> <option value="csv">Comma Separated Values (.csv)</option> - <option value="json">JSON</option> + <option value="json">JSON Taxomony Tree (ignored for pept2prot)</option> <option value="unmatched">Unmatched peptides</option> </param> <param name="strict" type="boolean" truevalue="--strict" falsevalue="" checked="false" label="Exit with error on invalid peptides, otherwise ignore them"/> </inputs> <outputs> - <data name="output_json" format="json" label="${tool.name} ${unipept.api} on ${on_string} json"> - <filter>'json' in outputs</filter> + <data name="output_json" format="d3_hierarchy" label="${tool.name} ${unipept.api} on ${on_string} json"> + <filter>'json' in outputs and unipept['api'] != 'pept2prot'</filter> + <change_format> + <when input="api" value="pept2prot" format="json" /> + </change_format> </data> <data name="output_tsv" format="tabular" label="${tool.name} ${unipept.api} on ${on_string} tsv"> <filter>'tsv' in outputs</filter>