Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_entrez_eutils commit 15bcc5104c577b4b9c761f2854fc686c07ffa9db
added:
README.rst __efetch_build_options.py ecitmatch.py ecitmatch.xml macros.xml efetch.py egquery.py einfo.py elink.py epost.py esearch.py esummary.py eutils.py eutils.pyc test-data/ecitmatch.results.tsv test-data/ecitmatch.tsv test-data/egquery.1.xml test-data/esearch.pubmed.2014-01-pnas.xml test-data/esearch.pubmed.xml test-data/esummary.tax.xml test-data/example.history.json test-data/pm-tax-neighbor.xml test-data/pubmed.metadata.xml test-data/viruses.tax.xml tool_dependencies.xml
diff -r 000000000000 -r 68cd8d564e0a README.rst
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,38 @@
+Galaxy NCBI Entrez Tools
+========================
+
+This repo requires a readme, as administrators should be very aware of some
+restrictions NCBI places on the use of the Entrez service.
+
+NCBI requests that you please limit large jobs to either weekends or
+between 9:00 PM and 5:00 AM Eastern time during weekdays. This is not a
+request that the Galaxy tool can easily enforce, so we've included it
+quite prominently in the disclaimer on every tool.
+
+Failure to comply with NCBI's policies may result in a block until
+you/the user contacts NCBI and registers the tool ID and their email.
+
+Note that these are *IP*-level blocks, so the Galaxy tools use a
+concatenation of the administrators' emails and the user's email, in
+hopes that NCBI will contact all relevant parties should their system be
+abused.
+
+Additionally, since these are IP-level blocks, the Galaxy tool author
+(@erasche) recommends using the following ``job_conf.xml`` snippet in
+order to place a system-wide restriction of 1 concurrent Entrez job
+amongst all users.
+
+.. code:: xml
+
+    <destination id="entrez" runner="local">
+    </destination>
+    <limit type="concurrent_jobs" id="entrez">1</limit>
+    <tools>
+        <tool id="ncbi.eutils.efetch" destination="entrez" />
+        <tool id="ncbi.eutils.esearch" destination="entrez" />
+        <tool id="ncbi.eutils.epost" destination="entrez" />
+        <tool id="ncbi.eutils.elink" destination="entrez" />
+        <tool id="ncbi.eutils.einfo" destination="entrez" />
+        <tool id="ncbi.eutils.esummary" destination="entrez" />
+    </tools>
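The email concatenation mentioned above happens client-side in eutils.py (included later in this changeset); a minimal sketch of that behavior, with placeholder addresses:

    # Sketch of the admin/user email concatenation described in the README
    # (placeholder addresses; mirrors the logic in eutils.py below).
    admin_email = "admin@example.org"
    user_email = "user@example.org"
    contact = ';'.join((admin_email, user_email))
    print(contact)  # admin@example.org;user@example.org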
diff -r 000000000000 -r 68cd8d564e0a __efetch_build_options.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/__efetch_build_options.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+# Daniel Blankenberg
+# Creates the options for tool interface
+import re
+
+# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi
+db_list = '''
+<DbName>annotinfo</DbName>
+<DbName>assembly</DbName>
+<DbName>bioproject</DbName>
+<DbName>biosample</DbName>
+<DbName>biosystems</DbName>
+<DbName>blastdbinfo</DbName>
+<DbName>books</DbName>
+<DbName>cdd</DbName>
+<DbName>clinvar</DbName>
+<DbName>clone</DbName>
+<DbName>dbvar</DbName>
+<DbName>gap</DbName>
+<DbName>gapplus</DbName>
+<DbName>gds</DbName>
+<DbName>gencoll</DbName>
+<DbName>gene</DbName>
+<DbName>genome</DbName>
+<DbName>geoprofiles</DbName>
+<DbName>grasp</DbName>
+<DbName>gtr</DbName>
+<DbName>homologene</DbName>
+<DbName>medgen</DbName>
+<DbName>mesh</DbName>
+<DbName>ncbisearch</DbName>
+<DbName>nlmcatalog</DbName>
+<DbName>nuccore</DbName>
+<DbName>nucest</DbName>
+<DbName>nucgss</DbName>
+<DbName>nucleotide</DbName>
+<DbName>omim</DbName>
+<DbName>orgtrack</DbName>
+<DbName>pcassay</DbName>
+<DbName>pccompound</DbName>
+<DbName>pcsubstance</DbName>
+<DbName>pmc</DbName>
+<DbName>popset</DbName>
+<DbName>probe</DbName>
+<DbName>protein</DbName>
+<DbName>proteinclusters</DbName>
+<DbName>pubmed</DbName>
+<DbName>pubmedhealth</DbName>
+<DbName>seqannot</DbName>
+<DbName>snp</DbName>
+<DbName>sra</DbName>
+<DbName>structure</DbName>
+<DbName>taxonomy</DbName>
+<DbName>unigene</DbName>'''.replace( "<DbName>", "").replace( "</DbName>", "").split("\n")
+
+
+help = ''' (all)
+ docsum            xml       Document Summary
+ docsum            json      Document Summary
+ full              text      Full Document
+ uilist            xml       Unique Identifier List
+ uilist            text      Unique Identifier List
+ full              xml       Full Document
+
+ bioproject
+ native            BioProject Report
+ native            xml       RecordSet
+
+ biosample
+ native            BioSample Report
+ native            xml       BioSampleSet
+
+ biosystems
+ native            xml       Sys-set
+
+ gds
+ native            xml       RecordSet
+ summary           text      Summary
+
+ gene
+ gene_table        xml       Gene Table
+ native            text      Gene Report
+ native            asn.1     Entrezgene
+ native            xml       Entrezgene-Set
+ tabular           tabular   Tabular Report
+
+ homologene
+ alignmentscores   text      Alignment Scores
+ fasta             fasta     FASTA
+ homologene        text      Homologene Report
+ native            text      Homologene List
+ native            asn.1     HG-Entry
+ native            xml       Entrez-Homologene-Set
+
+ mesh
+ full              text      Full Record
+ native            text      MeSH Report
+ native            xml       RecordSet
+
+ nlmcatalog
+ native            text      Full Record
+ native            xml       NLMCatalogRecordSet
+
+ pmc
+ medline           text      MEDLINE
+ native            xml       pmc-articleset
+
+ pubmed
+ abstract          xml       Abstract
+ medline           text      MEDLINE
+ native            asn.1     Pubmed-entry
+ native            xml       PubmedArticleSet
+
+ (sequences)
+ acc               text      Accession Number
+ est               xml       EST Report
+ fasta             fasta     FASTA
+ fasta             xml       TinySeq
+ fasta_cds_aa [...]
+ [...]SDSet
+ gss               text      GSS Report
+ ipg               text      Identical Protein Report
+ ipg               xml       IPGReportSet
+ native            text      Seq-entry
+ native            xml       Bioseq-set
+ seqid             asn.1     Seq-id
+
+ snp
+ chr               text      Chromosome Report
+ docset            text      Summary
+ fasta             fasta     FASTA
+ flt               text      Flat File
+ native            asn.1     Rs
+ native            xml       ExchangeSet
+ rsr               tabular   RS Cluster Report
+ ssexemplar        text      SS Exemplar List
+
+ sra
+ native            xml       EXPERIMENT_PACKAGE_SET
+ runinfo           xml       SraRunInfo
+
+ structure
+ mmdb              asn.1     Ncbi-mime-asn1 strucseq
+ native            text      MMDB Report
+ native            xml       RecordSet
+
+ taxonomy
+ native            text      Taxonomy List
+ native            xml       TaxaSet'''.split("\n")
+
+
+db = {}
+for db_name in db_list:
+    db[db_name] = []
+
+section = None
+for line in help:
+    line = re.split('\s{2,}', line.strip())
+    # Ignore empties
+    if len(line) == 0:
+        continue
+    # Section headers have one item
+    elif len(line) == 1:
+        section = line[0]
+        db[section] = []
+    # Format lines have 2+
+    elif len(line) == 2:
+        parent_format = line[0]
+        description = line[1]
+
+        if parent_format not in db[section]:
+            db[section].append((parent_format, None, description))
+    elif len(line) == 3:
+        parent_format = line[0]
+        format_modifier = line[1]
+        description = line[2]
+
+        if parent_format not in db[section]:
+            db[section].append((parent_format, format_modifier, description))
+
+
+all_formats = db['(all)']
+del db['(all)']
+sequences_formats = db['(sequences)']
+del db['(sequences)']
+del db['']
+
+for key in db:
+    db[key] += all_formats
+
+for key in ('nuccore', 'nucest', 'nucgss', 'nucleotide'):
+    db[key] += sequences_formats
+
+MACRO_TPL = '''
+
+'''
+
+WHEN_TPL = '''        <when value="{format}">
+            <param name="output_format" type="select" label="Output Format">
+                {format_options}
+            </param>
+        </when>'''
+
+FORMAT_OPTION_TPL = '''<option value="{name_type}">{name_type_human}</option>'''
+
+format_names = {}
+
+print '''    <xml name="db">
+        <conditional name="db">
+            <expand macro="dbselect" />'''
+for key in sorted(db):
+    format_options = []
+
+    for (parent_format, format_modifier, description) in sorted(db[key]):
+        name_human = description
+        if format_modifier:
+            name_human += ' (%s)' % format_modifier
+        format_string = '%s-%s' % (parent_format, format_modifier)
+
+        format_options.append(FORMAT_OPTION_TPL.format(
+            name_type=format_string,
+            name_type_human=name_human,
+        ))
+
+        format_names[format_string] = format_modifier
+
+    print WHEN_TPL.format(
+        format=key,
+        format_options='\n                '.join(format_options)
+    )
+
+print '''        </conditional>
+    </xml>'''
+
+CHANGE_FORMAT_TPL = '''
+    <xml name="efetch_formats">
+        <change_format>
+            {formats}
+        </change_format>
+    </xml>
+'''
+
+CHANGE_FORMAT_WHEN_TPL = '''<when input="output_format" value="{key}" format="{value}"/>'''
+# Format options
+
+
+whens = []
+for (k, v) in format_names.items():
+    if v is None:
+        v = 'text'
+    elif v == 'asn.1':
+        v = 'asn1'
+
+    whens.append(CHANGE_FORMAT_WHEN_TPL.format(
+        key=k, value=v
+    ))
+
+print CHANGE_FORMAT_TPL.format(formats='\n            '.join(whens))
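The parser above keys on runs of two or more spaces to separate the format, modifier, and description columns of the pasted efetch help text; a standalone sketch of that splitting rule (the sample line is illustrative):

    import re

    # Columns are separated by 2+ spaces, so single spaces inside
    # descriptions like 'Document Summary' survive the split.
    line = ' docsum            xml       Document Summary'
    print(re.split(r'\s{2,}', line.strip()))
    # -> ['docsum', 'xml', 'Document Summary']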
diff -r 000000000000 -r 68cd8d564e0a ecitmatch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecitmatch.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ECitMatch', epilog='')
+    parser.add_argument('--file', type=argparse.FileType('r'), help='Tabular file containing citations to search')
+
+    parser.add_argument('--key', nargs='*', help='Citation Key')
+    parser.add_argument('--journal_title', nargs='*', help='Journal Title')
+    parser.add_argument('--year', nargs='*', help='Year')
+    parser.add_argument('--volume', nargs='*', help='Volume')
+    parser.add_argument('--first_page', nargs='*', help='First Page')
+    parser.add_argument('--author_name', nargs='*', help='Author name')
+
+    # Emails
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+    args = parser.parse_args()
+
+    c = eutils.Client(user_email=args.user_email, admin_email=args.admin_email)
+
+    citations = []
+    if args.file is None:
+        for key, journal, year, volume, first_page, author_name in \
+                zip(args.key, args.journal_title, args.year, args.volume, args.first_page, args.author_name):
+            citations.append({
+                'key': key,
+                'journal': journal,
+                'year': year,
+                'volume': volume,
+                'first_page': first_page,
+                'author_name': author_name,
+            })
+    else:
+        for line in args.file:
+            line = line.strip()
+            if not line.startswith('#'):
+                tmp = line.split('\t')
+                try:
+                    citations.append({
+                        'journal': tmp[0],
+                        'year': tmp[1],
+                        'volume': tmp[2],
+                        'first_page': tmp[3],
+                        'author_name': tmp[4],
+                        'key': tmp[5],
+                    })
+                except IndexError:
+                    print "Could not parse line: %s" % line
+
+    payload = {
+        'db': 'pubmed',
+        'bdata': citations
+    }
+
+    results = c.citmatch(**payload)
+    # We get data back as pipe separated, so just replace those with tabs
+    print results.replace('|', '\t')
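For comparison, the ecitmatch example documented in the Biopython tutorial (Biopython >= 1.66) builds its citation dictionaries as below; note that it names the journal field journal_title, while the script above uses journal:

    from Bio import Entrez

    Entrez.email = "a.n.other@example.org"  # placeholder address

    # Citation keys as documented in Biopython's ecitmatch example;
    # the values match test-data/ecitmatch.tsv.
    citation = {
        "journal_title": "proc natl acad sci u s a",
        "year": "1991", "volume": "88", "first_page": "3248",
        "author_name": "mann bj", "key": "citation_1",
    }
    handle = Entrez.ecitmatch(db="pubmed", bdata=[citation])
    print(handle.read())  # pipe-delimited, ends with PMID 2014248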
diff -r 000000000000 -r 68cd8d564e0a ecitmatch.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecitmatch.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,105 @@
+<?xml version="1.0"?>
+<tool id="ncbi_eutils_ecitmatch" name="NCBI ECitMatch" version="@WRAPPER_VERSION@">
+  <description>search NCBI for citations in PubMed</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <version_command>python ecitmatch.py --version</version_command>
+  <command detect_errors="aggressive" interpreter="python"><![CDATA[ecitmatch.py
+
+#if str($input.method) == "file":
+    --file $input.citation_file
+#else
+    #set keys = '" "'.join( [ str( $citation.key ) for $citation in $input.citations ] )
+    #set journal_title = '" "'.join( [ str( $citation.journal_title ) for $citation in $input.citations ] )
+    #set year = '" "'.join( [ str( $citation.year ) for $citation in $input.citations ] )
+    #set volume = '" "'.join( [ str( $citation.volume ) for $citation in $input.citations ] )
+    #set first_page = '" "'.join( [ str( $citation.first_page ) for $citation in $input.citations ] )
+    #set author = '" "'.join( [ str( $citation.author ) for $citation in $input.citations ] )
+
+    --key "$keys"
+    --journal_title "$journal_title"
+    --year "$year"
+    --volume "$volume"
+    --first_page "$first_page"
+    --author_name "$author"
+#end if
+
+@EMAIL_ARGUMENTS@
+> $default]]></command>
+  <inputs>
+    <conditional name="input">
+      <param name="method" type="select" label="Input method">
+        <option value="file">Load citations from a formatted table</option>
+        <option value="direct">Direct Input</option>
+      </param>
+      <when value="file">
+        <param label="Citation table" name="citation_file" type="data"
+               format="tabular" help="Columns must be in a specific order, see help documentation"/>
+      </when>
+      <when value="direct">
+        <repeat name="citations" title="Citations">
+          <param name="journal_title" type="text" label="Journal Title"
+                 help="E.g. proc natl acad sci u s a" />
+          <param name="year" type="integer" label="Year" value="2000"/>
+          <param name="volume" type="integer" label="Volume" value="88"/>
+          <param name="first_page" type="integer" label="First Page" value="1"/>
+          <param name="author" type="text" label="Author's Name" />
+          <param name="key" type="text" label="Citation key"
+                 help="Used to match input results to NCBI's output" />
+        </repeat>
+      </when>
+    </conditional>
+
+  </inputs>
+  <outputs>
+    <data format="tabular" name="default" label="ECitMatch Results"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="method" value="file"/>
+      <param name="citation_file" value="ecitmatch.tsv"/>
+      <output name="default" file="ecitmatch.results.tsv" ftype="tabular"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+NCBI ECitMatch
+==============
+
+Look up the PubMed IDs of citations. Citations can be provided via a tabular
+file or via direct input. If provided via file, the columns must be ordered:
+
+1. Journal Name
+2. Year
+3. Volume
+4. First Page
+5. Author Name
+6. Citation Key
+
+An example query:
+
++---------------+--------------------------+
+| Parameter     | Value                    |
++===============+==========================+
+| Journal Title | proc natl acad sci u s a |
++---------------+--------------------------+
+| Year          | 1991                     |
++---------------+--------------------------+
+| Volume        | 88                       |
++---------------+--------------------------+
+| First Page    | 3248                     |
++---------------+--------------------------+
+| Author Name   | mann bj                  |
++---------------+--------------------------+
+| Citation Key  | citation_1               |
++---------------+--------------------------+
+
+@REFERENCES@
+
+@DISCLAIMER@
+  ]]></help>
+  <expand macro="citations"/>
+</tool>
diff -r 000000000000 -r 68cd8d564e0a efetch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/efetch.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='EFetch', epilog='')
+    parser.add_argument('db', help='Database to use')
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+
+    # ID source
+    parser.add_argument('--id_list', help='list of ids')
+    parser.add_argument('--id', help='Comma separated individual IDs')
+    parser.add_argument('--history_file', help='Fetch results from previous query')
+
+    # Output
+    parser.add_argument('--retmode', help='Retmode')
+    parser.add_argument('--rettype', help='Rettype')
+    args = parser.parse_args()
+
+    c = eutils.Client(history_file=args.history_file, user_email=args.user_email, admin_email=args.admin_email)
+    merged_ids = c.parse_ids(args.id_list, args.id, args.history_file)
+
+    payload = {}
+    if args.history_file is not None:
+        payload.update(c.get_history())
+    else:
+        payload['id'] = ','.join(merged_ids)
+
+    for attr in ('retmode', 'rettype'):
+        if getattr(args, attr, None) is not None:
+            payload[attr] = getattr(args, attr)
+
+    c.fetch(args.db, ftype=args.retmode, **payload)
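The --retmode/--rettype pair is forwarded untouched to Biopython; a minimal direct equivalent of one fetch (PMID taken from test-data/pm-tax-neighbor.xml):

    from Bio import Entrez

    Entrez.email = "a.n.other@example.org"  # placeholder address

    # Same parameters the wrapper places in its payload dict.
    handle = Entrez.efetch(db="pubmed", id="22241621",
                           rettype="abstract", retmode="text")
    print(handle.read())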
diff -r 000000000000 -r 68cd8d564e0a egquery.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egquery.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='EGQuery', epilog='')
+    parser.add_argument('term', help='Query')
+    #
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+    args = parser.parse_args()
+
+    c = eutils.Client(user_email=args.user_email, admin_email=args.admin_email)
+
+    payload = {
+        'term': args.term,
+    }
+    results = c.gquery(**payload)
+    print results
diff -r 000000000000 -r 68cd8d564e0a einfo.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/einfo.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='EInfo', epilog='')
+    parser.add_argument('--db', help='Database to use')
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+    args = parser.parse_args()
+
+    c = eutils.Client(user_email=args.user_email, admin_email=args.admin_email)
+    payload = {}
+    if args.db is not None:
+        payload['db'] = args.db
+        payload['version'] = '2.0'
+    print c.info(**payload)
diff -r 000000000000 -r 68cd8d564e0a elink.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/elink.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ELink', epilog='')
+    parser.add_argument('db', help='Database to use, sometimes "none" (e.g. *check)')
+    parser.add_argument('dbfrom', help='Database containing input UIDs')
+    parser.add_argument('cmd', choices=['neighbor', 'neighbor_score',
+                                        'neighbor_history', 'acheck', 'ncheck', 'lcheck',
+                                        'llinks', 'llinkslib', 'prlinks'],
+                        help='ELink command mode')
+    # Only used in case of neighbor_history
+    parser.add_argument('--history_out', type=argparse.FileType('w'),
+                        help='Output history file', default='-')
+
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+    # ID Sources
+    parser.add_argument('--id_list', help='list of ids')
+    parser.add_argument('--id', help='Comma separated individual IDs')
+    parser.add_argument('--history_file', help='Fetch results from previous query')
+
+    # TODO: dates, linkname, term, holding
+    # neighbor or neighbor_history and dbfrom is pubmed
+    # parser.add_argument('--datetype', help='Date type')
+    # parser.add_argument('--reldate', help='In past N days')
+    # parser.add_argument('--mindate', help='Minimum date')
+    # parser.add_argument('--maxdate', help='Maximum date')
+
+    # Output
+    args = parser.parse_args()
+
+    c = eutils.Client(history_file=args.history_file, user_email=args.user_email, admin_email=args.admin_email)
+    merged_ids = c.parse_ids(args.id_list, args.id, args.history_file)
+
+    payload = {
+        'dbfrom': args.dbfrom,
+        'cmd': args.cmd,
+    }
+    if args.history_file is not None:
+        payload.update(c.get_history())
+    else:
+        payload['id'] = ','.join(merged_ids)
+
+    # DB can be 'none' in a few cases.
+    if args.db != "none":
+        payload['db'] = args.db
+
+    results = c.link(**payload)
+
+    if args.cmd == "neighbor_history":
+        history = c.extract_history(results)
+        args.history_out.write(json.dumps(history, indent=4))
+
+    print results
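The test file pm-tax-neighbor.xml corresponds to a plain neighbor query from taxonomy to pubmed; a direct Biopython sketch of that request:

    from Bio import Entrez

    Entrez.email = "a.n.other@example.org"  # placeholder address

    # Mirrors the wrapper's payload for test-data/pm-tax-neighbor.xml:
    # link taxonomy ID 510899 to its PubMed neighbors.
    handle = Entrez.elink(dbfrom="taxonomy", db="pubmed",
                          id="510899", cmd="neighbor")
    print(handle.read())  # eLinkResult XML containing <Id>22241621</Id>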
diff -r 000000000000 -r 68cd8d564e0a epost.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/epost.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='EPost', epilog='')
+    parser.add_argument('db', help='Database to use')
+    parser.add_argument('--id_list', help='list of ids')
+    parser.add_argument('--id', help='Comma separated individual IDs')
+    parser.add_argument('--history_file', help='Post to new QueryKey in an existing WebEnv')
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+
+    args = parser.parse_args()
+
+    c = eutils.Client(history_file=args.history_file, user_email=args.user_email, admin_email=args.admin_email)
+    merged_ids = c.parse_ids(args.id_list, args.id, args.history_file)
+
+    payload = {}
+    if args.history_file is not None:
+        payload.update(c.get_history())
+    else:
+        payload['id'] = ','.join(merged_ids)
+        payload['WebEnv'] = ''
+
+    print c.post(args.db, **payload)
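An EPost round trip returns exactly the QueryKey/WebEnv pair that these wrappers persist as a JSON history file; per documented Biopython usage (IDs taken from test-data/esearch.pubmed.xml):

    from Bio import Entrez

    Entrez.email = "a.n.other@example.org"  # placeholder address

    # EPost uploads IDs and returns a history point on NCBI's side.
    result = Entrez.read(Entrez.epost("pubmed", id="16578858,11186225"))
    print(result["QueryKey"], result["WebEnv"])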
diff -r 000000000000 -r 68cd8d564e0a esearch.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/esearch.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+import json
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ESearch', epilog='')
+    parser.add_argument('db', help='Database to use')
+    parser.add_argument('term', help='Query')
+    parser.add_argument('--history_file', help='Filter existing history')
+    parser.add_argument('--datetype', help='Date type')
+    parser.add_argument('--reldate', help='In past N days')
+    parser.add_argument('--mindate', help='Minimum date')
+    parser.add_argument('--maxdate', help='Maximum date')
+    # History
+    parser.add_argument('--history_out', type=argparse.FileType('w'),
+                        help='Output history file')
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+    args = parser.parse_args()
+
+    c = eutils.Client(history_file=args.history_file, user_email=args.user_email, admin_email=args.admin_email)
+
+    payload = {
+        'db': args.db,
+        'term': args.term,
+        'retstart': 0,
+        'retmax': 20,
+        # hmmm @ retmax
+    }
+    if args.history_file is not None:
+        payload.update(c.get_history())
+    if args.history_out is not None:
+        payload['usehistory'] = 'y'
+
+    for attr in ('datetype', 'reldate', 'mindate', 'maxdate'):
+        if getattr(args, attr, None) is not None:
+            payload[attr] = getattr(args, attr)
+
+    results = c.search(**payload)
+
+    if args.history_out is not None:
+        history = c.extract_history(results)
+        args.history_out.write(json.dumps(history, indent=4))
+
+    print results
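When --history_out is given, the wrapper sets usehistory='y' so the response carries a QueryKey/WebEnv pair; a compact sketch of the resulting search-then-fetch chain (illustrative query term):

    from Bio import Entrez

    Entrez.email = "a.n.other@example.org"  # placeholder address

    # Search with history enabled, which is what --history_out triggers...
    search = Entrez.read(Entrez.esearch(db="pubmed", term="bacteriophage",
                                        usehistory="y"))

    # ...then fetch against the stored result set instead of raw IDs.
    handle = Entrez.efetch(db="pubmed", query_key=search["QueryKey"],
                           WebEnv=search["WebEnv"],
                           rettype="abstract", retmode="text", retmax=5)
    print(handle.read())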
diff -r 000000000000 -r 68cd8d564e0a esummary.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/esummary.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+import argparse
+import eutils
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ESummary', epilog='')
+    parser.add_argument('db', help='Database to use')
+    parser.add_argument('--id_list', help='list of ids')
+    parser.add_argument('--id', help='Comma separated individual IDs')
+    parser.add_argument('--history_file', help='Filter existing history')
+    parser.add_argument('--user_email', help="User email")
+    parser.add_argument('--admin_email', help="Admin email")
+    args = parser.parse_args()
+
+    c = eutils.Client(history_file=args.history_file, user_email=args.user_email, admin_email=args.admin_email)
+
+    merged_ids = c.parse_ids(args.id_list, args.id, args.history_file)
+
+    payload = {
+        'db': args.db,
+    }
+
+    if args.history_file is not None:
+        payload.update(c.get_history())
+    else:
+        payload['id'] = ','.join(merged_ids)
+
+    print c.summary(**payload)
diff -r 000000000000 -r 68cd8d564e0a eutils.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/eutils.py	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,127 @@
+import os
+import json
+import StringIO
+from Bio import Entrez
+Entrez.tool = "GalaxyEutils_1_0"
+BATCH_SIZE = 200
+
+
+class Client(object):
+
+    def __init__(self, history_file=None, user_email=None, admin_email=None):
+        self.using_history = False
+
+        if user_email is not None and admin_email is not None:
+            Entrez.email = ';'.join((admin_email, user_email))
+        elif user_email is not None:
+            Entrez.email = user_email
+        elif admin_email is not None:
+            Entrez.email = admin_email
+        else:
+            Entrez.email = os.environ.get('NCBI_EUTILS_CONTACT', None)
+
+        if Entrez.email is None:
+            raise Exception("Cannot continue without an email; please set "
+                            "administrator email in NCBI_EUTILS_CONTACT")
+
+        if history_file is not None:
+            with open(history_file, 'r') as handle:
+                data = json.loads(handle.read())
+                self.query_key = data['QueryKey']
+                self.webenv = data['WebEnv']
+                self.using_history = True
+
+    def get_history(self):
+        if not self.using_history:
+            return {}
+        else:
+            return {
+                'query_key': self.query_key,
+                'WebEnv': self.webenv,
+            }
+
+    def post(self, database, **payload):
+        return json.dumps(Entrez.read(Entrez.epost(database, **payload)), indent=4)
+
+    def fetch(self, db, ftype=None, **payload):
+        os.makedirs("downloads")
+
+        if 'id' in payload:
+            summary = self.id_summary(db, payload['id'])
+        else:
+            summary = self.history_summary(db)
+
+        count = len(summary)
+        payload['retmax'] = BATCH_SIZE
+
+        # This may be bad. I'm not sure yet. I think it will be ... but UGH.
+        for i in range(0, count, BATCH_SIZE):
+            payload['retstart'] = i
+            file_path = os.path.join('downloads', 'EFetch Results Chunk %s.%s' % (i, ftype))
+            with open(file_path, 'w') as handle:
+                handle.write(Entrez.efetch(db, **payload).read())
+
+    def id_summary(self, db, id_list):
+        payload = {
+            'db': db,
+            'id': id_list,
+        }
+        return Entrez.read(Entrez.esummary(**payload))
+
+    def history_summary(self, db):
+        if not self.using_history:
+            raise Exception("History must be available for this method")
+
+        payload = {
+            'db': db,
+            'query_key': self.query_key,
+            'WebEnv': self.webenv,
+        }
+        return Entrez.read(Entrez.esummary(**payload))
+
+    def summary(self, **payload):
+        return Entrez.esummary(**payload).read()
+
+    def link(self, **payload):
+        return Entrez.elink(**payload).read()
+
+    def extract_history(self, xml_data):
+        parsed_data = Entrez.read(StringIO.StringIO(xml_data))
+        history = {}
+        for key in ('QueryKey', 'WebEnv'):
+            if key in parsed_data:
+                history[key] = parsed_data[key]
+
+        return history
+
+    def search(self, **payload):
+        return Entrez.esearch(**payload).read()
+
+    def info(self, **kwargs):
+        return Entrez.einfo(**kwargs).read()
+
+    def gquery(self, **kwargs):
+        return Entrez.egquery(**kwargs).read()
+
+    def citmatch(self, **kwargs):
+        return Entrez.ecitmatch(**kwargs).read()
+
+    @classmethod
+    def parse_ids(cls, id_list, id, history_file):
+        """Parse IDs passed on --cli or in a file passed to the cli
+        """
+        merged_ids = []
+        if id is not None:
+            for pid in id.replace('__cn__', ',').replace('\n', ',').split(','):
+                if pid is not None and len(pid) > 0:
+                    merged_ids.append(pid)
+
+        if id_list is not None:
+            with open(id_list, 'r') as handle:
+                merged_ids += [x.strip() for x in handle.readlines()]
+
+        # Exception handled here for uniformity
+        if len(merged_ids) == 0 and history_file is None:
+            raise Exception("Must provide history file or IDs")
+
+        return merged_ids
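Galaxy encodes newlines in text parameters as __cn__, which is why parse_ids normalizes them before splitting; since it is a classmethod, it can be exercised without constructing a Client (assuming this eutils.py is the module on the path; IDs from the test data):

    from eutils import Client

    # '__cn__' (Galaxy's encoded newline) and commas both act as separators.
    ids = Client.parse_ids(None, '10239__cn__510899,22241621', None)
    print(ids)  # ['10239', '510899', '22241621']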
diff -r 000000000000 -r 68cd8d564e0a eutils.pyc |
Binary file eutils.pyc has changed |
diff -r 000000000000 -r 68cd8d564e0a macros.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,847 @@
+<?xml version="1.0"?>
+<macros>
+  <token name="@WRAPPER_VERSION@">1.1</token>
+  <token name="@EMAIL_ARGUMENTS@">
+--user_email "$__user_email__"
+#set admin_emails = ';'.join(str($__admin_users__).split(','))
+--admin_email "$admin_emails"
+  </token>
+  <!-- TODO: citation -->
+  <token name="@REFERENCES@"><![CDATA[
+  ]]></token>
+  <token name="@DISCLAIMER@"><![CDATA[
+Usage Guidelines and Requirements
+=================================
+
+Frequency, Timing, and Registration of E-utility URL Requests
+-------------------------------------------------------------
+
+In order not to overload the E-utility servers, NCBI recommends that users
+limit large jobs to either weekends or between 9:00 PM and 5:00 AM Eastern time
+during weekdays. Failure to comply with this policy may result in an IP address
+being blocked from accessing NCBI.
+
+Minimizing the Number of Requests
+---------------------------------
+
+If a task requires searching for and/or downloading a large number of
+records, it is much more efficient to use the Entrez History to upload
+and/or retrieve these records in batches rather than using separate
+requests for each record. Please refer to Application 3 in Chapter 3
+for an example. Many thousands of IDs can be uploaded using a single
+EPost request, and several hundred records can be downloaded using one
+EFetch request.
+
+
+Disclaimer and Copyright Issues
+-------------------------------
+
+In accordance with requirements of NCBI's E-Utilities, we must provide
+the following disclaimer:
+
+Please note that abstracts in PubMed may incorporate material that may
+be protected by U.S. and foreign copyright laws. All persons
+reproducing, redistributing, or making commercial use of this
+information are expected to adhere to the terms and conditions asserted
+by the copyright holder. Transmission or reproduction of protected
+items beyond that allowed by fair use (PDF) as defined in the copyright
+laws requires the written permission of the copyright owners. NLM
+provides no legal advice concerning distribution of copyrighted
+materials. Please consult your legal counsel. If you wish to do a large
+data mining project on PubMed data, you can enter into a licensing
+agreement and lease the data for free from NLM. For more information on
+this please see `http://www.nlm.nih.gov/databases/leased.html <http://www.nlm.nih.gov/databases/leased.html>`__
+
+The `full disclaimer <http://www.ncbi.nlm.nih.gov/About/disclaimer.html>`__ is available on
+their website.
+
+Liability
+~~~~~~~~~
+
+For documents and software available from this server, the
+U.S. Government does not warrant or assume any legal liability or
+responsibility for the accuracy, completeness, or usefulness of any
+information, apparatus, product, or process disclosed.
+
+Endorsement
+~~~~~~~~~~~
+
+NCBI does not endorse or recommend any commercial
+products, processes, or services. The views and opinions of authors
+expressed on NCBI's Web sites do not necessarily state or reflect those
+of the U.S. Government, and they may not be used for advertising or
+product endorsement purposes.
+
+External Links
+~~~~~~~~~~~~~~
+
+Some NCBI Web pages may provide links to other Internet
+sites for the convenience of users. NCBI is not responsible for the
+availability or content of these external sites, nor does NCBI endorse,
+warrant, or guarantee the products, services, or information described
+or offered at these other Internet sites. Users cannot assume that the
+external sites will abide by the same Privacy Policy to which NCBI
+adheres. It is the responsibility of the user to examine the copyright
+and licensing restrictions of linked pages and to secure all necessary
+permissions.
+  ]]></token>
+  <xml name="dbselect"
+       token_name="db_select"
+       token_label="NCBI Database to Use"
+  >
+    <param name="@NAME@" type="select" label="@LABEL@">
+      <option value="annotinfo">Annotation Information</option>
+[...]
+      <when input="output_format" value="ipg-text" format="text"/>
+      <when input="output_format" value="uilist-xml" format="xml"/>
+      <when input="output_format" value="docsum-xml" format="xml"/>
+      <when input="output_format" value="rsr-tabular" format="tabular"/>
+      <when input="output_format" value="uilist-text" format="text"/>
+      <when input="output_format" value="gb-text" format="text"/>
+      <when input="output_format" value="chr-text" format="text"/>
+      <when input="output_format" value="alignmentscores-text" format="text"/>
+      <when input="output_format" value="native-asn.1" format="asn1"/>
+      <when input="output_format" value="gp-xml" format="xml"/>
+      <when input="output_format" value="tabular-tabular" format="tabular"/>
+      <when input="output_format" value="ssexemplar-text" format="text"/>
+      <when input="output_format" value="docsum-json" format="json"/>
+      <when input="output_format" value="fasta-xml" format="xml"/>
+      <when input="output_format" value="runinfo-xml" format="xml"/>
+      <when input="output_format" value="flt-text" format="text"/>
+      <when input="output_format" value="fasta-fasta" format="fasta"/>
+      <when input="output_format" value="full-text" format="text"/>
+      <when input="output_format" value="gb-xml" format="xml"/>
+      <when input="output_format" value="abstract-xml" format="xml"/>
+      <when input="output_format" value="full-xml" format="xml"/>
+      <when input="output_format" value="ft-text" format="text"/>
+      <when input="output_format" value="homologene-text" format="text"/>
+      <when input="output_format" value="est-xml" format="xml"/>
+      <when input="output_format" value="gene_table-xml" format="xml"/>
+      <when input="output_format" value="docset-text" format="text"/>
+      <when input="output_format" value="native-xml" format="xml"/>
+    </change_format>
+  </xml>
+  <token name="@LIST_OR_HIST@">
+#if $query_source.qss == "history":
+    --history_file $query_source.history_file
+#else if $query_source.qss == "id_file":
+    --id_list $query_source.id_file
+#else if $query_source.qss == "id_list":
+    --id $query_source.id_list
+#end if
+  </token>
+  <xml name="list_or_hist">
+    <conditional name="query_source">
+      <param name="qss" type="select" label="Select source for IDs">
+        <option value="history">NCBI WebEnv History</option>
+        <option value="id_file">File containing IDs (one per line)</option>
+        <option value="id_list">Direct Entry</option>
+      </param>
+      <when value="history">
+        <param label="History File" name="history_file" type="data" format="json"/>
+      </when>
+      <when value="id_file">
+        <param label="ID List" name="id_file" type="data" format="text,tabular"/>
+      </when>
+      <when value="id_list">
+        <param label="ID List" name="id_list" type="text" area="true" help="Newline/Comma separated list of IDs"/>
+      </when>
+    </conditional>
+  </xml>
+  <xml name="history_out">
+    <data format="json" name="history" label="NCBI Entrez WebEnv History">
+      <yield/>
+    </data>
+  </xml>
+  <xml name="citations">
+    <citations>
+      <citation type="bibtex">@Book{ncbiEutils,
+        author = {Eric Sayers},
+        title = {Entrez Programming Utilities Help},
+        year = {2010},
+        publisher = {National Center for Biotechnology Information, Bethesda, Maryland},
+        note = {http://www.ncbi.nlm.nih.gov/books/NBK25500/}
+      }</citation>
+    </citations>
+  </xml>
+  <xml name="requirements">
+    <requirements>
+      <requirement type="package" version="2.7">python</requirement>
+      <requirement type="package" version="1.66">biopython</requirement>
+    </requirements>
+  </xml>
+  <xml name="linkname">
+    <param name="linkname" type="select" label="To NCBI Database">
+      <!-- TODO: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html -->
+    </param>
+  </xml>
+</macros>
diff -r 000000000000 -r 68cd8d564e0a test-data/ecitmatch.results.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecitmatch.results.tsv	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,2 @@
+	1991	88	3248	mann bj	citation_1	2014248
+
diff -r 000000000000 -r 68cd8d564e0a test-data/ecitmatch.tsv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ecitmatch.tsv	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,2 @@
+#journal	year	volume	first page	author	key
+proc natl acad sci u s a	1991	88	3248	mann bj	citation_1
diff -r 000000000000 -r 68cd8d564e0a test-data/egquery.1.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/egquery.1.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE Result PUBLIC "-//NLM//DTD eSearchResult, January 2004//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/egquery.dtd">
+<Result>
+
+    <Term>bacteriophage</Term>
+
+    <eGQueryResult>
diff -r 000000000000 -r 68cd8d564e0a test-data/esearch.pubmed.2014-01-pnas.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/esearch.pubmed.2014-01-pnas.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
+<eSearchResult><Count>524</Count><RetMax>20</RetMax><RetStart>0</RetStart><IdList>
+<Id>24620368</Id>
+<Id>24613929</Id>
+<Id>24596955</Id>
+<Id>24596954</Id>
+<Id>24571024</Id>
+<Id>24555201</Id>
+<Id>24555200</Id>
+<Id>24550301</Id>
+<Id>24520173</Id>
+<Id>24520172</Id>
+<Id>24497494</Id>
+<Id>24497493</Id>
+<Id>24488973</Id>
+<Id>24488972</Id>
+<Id>24488971</Id>
+<Id>24481254</Id>
+<Id>24481253</Id>
+<Id>24481252</Id>
+<Id>24477693</Id>
+<Id>24477692</Id>
+</IdList><TranslationSet><Translation> <From>PNAS[ta]</From> <To>"Proc Natl Acad Sci U S A"[Journal]</To> </Translation></TranslationSet><TranslationStack> <TermSet> <Term>"Proc Natl Acad Sci U S A"[Journal]</Term> <Field>Journal</Field> <Count>124812</Count> <Explode>N</Explode> </TermSet> <TermSet> <Term>2014/01/01[PDAT]</Term> <Field>PDAT</Field> <Count>0</Count> <Explode>N</Explode> </TermSet> <TermSet> <Term>2014/02/01[PDAT]</Term> <Field>PDAT</Field> <Count>0</Count> <Explode>N</Explode> </TermSet> <OP>RANGE</OP> <OP>AND</OP> </TranslationStack><QueryTranslation>"Proc Natl Acad Sci U S A"[Journal] AND 2014/01/01[PDAT] : 2014/02/01[PDAT]</QueryTranslation></eSearchResult>
+
diff -r 000000000000 -r 68cd8d564e0a test-data/esearch.pubmed.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/esearch.pubmed.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">
+<eSearchResult><Count>2651</Count><RetMax>20</RetMax><RetStart>0</RetStart><IdList>
+<Id>16578858</Id>
+<Id>11186225</Id>
+<Id>11121081</Id>
+<Id>11121080</Id>
+<Id>11121079</Id>
+<Id>11121078</Id>
+<Id>11121077</Id>
+<Id>11121076</Id>
+<Id>11121075</Id>
+<Id>11121074</Id>
+<Id>11121073</Id>
+<Id>11121072</Id>
+<Id>11121071</Id>
+<Id>11121070</Id>
+<Id>11121069</Id>
+<Id>11121068</Id>
+<Id>11121067</Id>
+<Id>11121066</Id>
+<Id>11121065</Id>
+<Id>11121064</Id>
+</IdList><TranslationSet><Translation> <From>PNAS[ta]</From> <To>"Proc Natl Acad Sci U S A"[Journal]</To> </Translation></TranslationSet><TranslationStack> <TermSet> <Term>"Proc Natl Acad Sci U S A"[Journal]</Term> <Field>Journal</Field> <Count>124812</Count> <Explode>N</Explode> </TermSet> <TermSet> <Term>97[vi]</Term> <Field>vi</Field> <Count>77218</Count> <Explode>N</Explode> </TermSet> <OP>AND</OP> <OP>GROUP</OP> </TranslationStack><QueryTranslation>"Proc Natl Acad Sci U S A"[Journal] AND 97[vi]</QueryTranslation></eSearchResult>
+
diff -r 000000000000 -r 68cd8d564e0a test-data/esummary.tax.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/esummary.tax.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD esummary v1 20041029//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20041029/esummary-v1.dtd">
+<eSummaryResult>
+<DocSum>
+    <Id>10239</Id>
+    <Item Name="Status" Type="String">active</Item>
+    <Item Name="Rank" Type="String">superkingdom</Item>
+    <Item Name="Division" Type="String">viruses</Item>
+    <Item Name="ScientificName" Type="String">Viruses</Item>
+    <Item Name="CommonName" Type="String"></Item>
+    <Item Name="TaxId" Type="Integer">10239</Item>
+    <Item Name="AkaTaxId" Type="Integer">0</Item>
+    <Item Name="Genus" Type="String"></Item>
+    <Item Name="Species" Type="String"></Item>
+    <Item Name="Subsp" Type="String"></Item>
+    <Item Name="ModificationDate" Type="Date">2010/11/23 00:00</Item>
+</DocSum>
+
+</eSummaryResult>
+
diff -r 000000000000 -r 68cd8d564e0a test-data/example.history.json
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/example.history.json	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,4 @@
+{
+    "QueryKey": "1",
+    "WebEnv": "NCID_1_9485527_130.14.22.215_9001_1430928295_33285243_0MetA0_S_MegaStore_F_1"
+}
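This file is the shape Client.extract_history writes and the --history_file options consume; a round-trip sketch of how eutils.py turns it back into request parameters:

    import json

    # Load the history file the way eutils.Client.__init__ does, then
    # build the fields get_history() contributes to the next request.
    with open('test-data/example.history.json') as handle:
        data = json.load(handle)

    payload = {'query_key': data['QueryKey'], 'WebEnv': data['WebEnv']}
    print(payload)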
diff -r 000000000000 -r 68cd8d564e0a test-data/pm-tax-neighbor.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pm-tax-neighbor.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD elink 20101123//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20101123/elink.dtd">
+<eLinkResult>
+
+  <LinkSet>
+    <DbFrom>taxonomy</DbFrom>
+    <IdList>
+      <Id>510899</Id>
+    </IdList>
+
+    <LinkSetDb>
+      <DbTo>pubmed</DbTo>
+      <LinkName>taxonomy_pubmed_entrez</LinkName>
+
+      <Link>
+        <Id>22241621</Id>
+      </Link>
+
+    </LinkSetDb>
+
+
+  </LinkSet>
+</eLinkResult>
+
diff -r 000000000000 -r 68cd8d564e0a test-data/pubmed.metadata.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pubmed.metadata.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE eInfoResult PUBLIC "-//NLM//DTD einfo 20130322//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20130322/einfo.dtd">
+<eInfoResult>
+    <DbInfo>
+    <DbName>pubmed</DbName>
+    <MenuName>PubMed</MenuName>
+    <Description>PubMed bibliographic record</Description>
diff -r 000000000000 -r 68cd8d564e0a test-data/viruses.tax.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/viruses.tax.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<!DOCTYPE TaxaSet PUBLIC "-//NLM//DTD Taxon, 14th January 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/taxon.dtd">
+<TaxaSet><Taxon>
+    <TaxId>10239</TaxId>
+    <ScientificName>Viruses</ScientificName>
+    <OtherNames>
+        <BlastName>viruses</BlastName>
+        <Synonym>Vira</Synonym>
+        <Synonym>Viridae</Synonym>
+    </OtherNames>
+    <ParentTaxId>1</ParentTaxId>
+    <Rank>superkingdom</Rank>
+    <Division>Viruses</Division>
+    <GeneticCode>
+        <GCId>1</GCId>
+        <GCName>Standard</GCName>
+    </GeneticCode>
+    <MitoGeneticCode>
+        <MGCId>0</MGCId>
+        <MGCName>Unspecified</MGCName>
+    </MitoGeneticCode>
+    <Lineage/>
+    <CreateDate>1995/02/27 09:24:00</CreateDate>
+    <UpdateDate>2010/11/23 11:40:11</UpdateDate>
+    <PubDate>1993/04/20 01:00:00</PubDate>
+</Taxon>
+
+</TaxaSet>
+
diff -r 000000000000 -r 68cd8d564e0a tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Jul 07 02:39:21 2016 -0400
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <set_environment version="1.0">
+    <environment_variable action="set_to" name="NCBI_EUTILS_CONTACT">/please set the administrator's contact email in the corresponding env.sh file/</environment_variable>
+  </set_environment>
+  <package name="biopython" version="1.66">
+    <repository changeset_revision="8433ee4531ff" name="package_biopython_1_66" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+  </package>
+</tool_dependency>