Mercurial > repos > galaxyp > uniprotxml_downloader
comparison uniprotxml_downloader.py @ 6:a371252a2cf6 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 0c5222345ace5054df44da29cab278f4a02e2b41
author | galaxyp |
---|---|
date | Thu, 06 Jul 2023 21:15:39 +0000 |
parents | 265c35540faa |
children |
comparison
equal
deleted
inserted
replaced
5:265c35540faa | 6:a371252a2cf6 |
---|---|
45 | 45 |
46 | 46 |
47 def __main__(): | 47 def __main__(): |
48 # Parse Command Line | 48 # Parse Command Line |
49 parser = optparse.OptionParser() | 49 parser = optparse.OptionParser() |
50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') | 50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids') |
51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') | 51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids') |
52 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') | 52 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') |
53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') | 53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') |
54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') | 54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') |
55 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id'], default='taxonomy_name', help='query field') | 55 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') |
56 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') | 56 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') |
57 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') | 57 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') |
58 (options, args) = parser.parse_args() | 58 (options, args) = parser.parse_args() |
59 taxids = set(options.taxon) | 59 search_ids = set(options.search_id) |
60 if options.input: | 60 if options.input: |
61 with open(options.input, 'r') as inputFile: | 61 with open(options.input, 'r') as inputFile: |
62 for linenum, line in enumerate(inputFile): | 62 for linenum, line in enumerate(inputFile): |
63 if line.startswith('#'): | 63 if line.startswith('#'): |
64 continue | 64 continue |
65 fields = line.rstrip('\r\n').split('\t') | 65 fields = line.rstrip('\r\n').split('\t') |
66 if len(fields) > abs(options.column): | 66 if len(fields) > abs(options.column): |
67 taxid = fields[options.column].strip() | 67 search_id = fields[options.column].strip() |
68 if taxid: | 68 if search_id: |
69 taxids.add(taxid) | 69 search_ids.add(search_id) |
70 taxon_queries = [f'{options.field}:"{taxid}"' for taxid in taxids] | 70 search_queries = [f'{options.field}:"{search_id}"' for search_id in search_ids] |
71 taxon_query = ' OR '.join(taxon_queries) | 71 search_query = ' OR '.join(search_queries) |
72 if options.output: | 72 if options.output: |
73 dest_path = options.output | 73 dest_path = options.output |
74 else: | 74 else: |
75 dest_path = "uniprot_%s.xml" % '_'.join(taxids) | 75 dest_path = "uniprot_%s.xml" % '_'.join(search_ids) |
76 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' | 76 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' |
77 try: | 77 try: |
78 url = 'https://rest.uniprot.org/uniprotkb/stream' | 78 url = 'https://rest.uniprot.org/uniprotkb/stream' |
79 query = "%s%s" % (taxon_query, reviewed) | 79 query = "%s%s" % (search_query, reviewed) |
80 params = {'query': query, 'format': options.format} | 80 params = {'query': query, 'format': options.format} |
81 if options.debug: | 81 if options.debug: |
82 print("%s ? %s" % (url, params), file=sys.stderr) | 82 print("%s ? %s" % (url, params), file=sys.stderr) |
83 data = parse.urlencode(params) | 83 data = parse.urlencode(params) |
84 print(f"Retrieving: {url}?{data}") | 84 print(f"Retrieving: {url}?{data}") |
110 if re.match(pattern, line): | 110 if re.match(pattern, line): |
111 break | 111 break |
112 else: | 112 else: |
113 print("failed: Not a uniprot xml file", file=sys.stderr) | 113 print("failed: Not a uniprot xml file", file=sys.stderr) |
114 exit(1) | 114 exit(1) |
115 print("NCBI Taxon ID:%s" % taxids, file=sys.stdout) | 115 print("Search IDs:%s" % search_ids, file=sys.stdout) |
116 if 'X-UniProt-Release' in response.headers: | 116 if 'X-UniProt-Release' in response.headers: |
117 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) | 117 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) |
118 if 'X-Total-Results' in response.headers: | 118 if 'X-Total-Results' in response.headers: |
119 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) | 119 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) |
120 except Exception as e: | 120 except Exception as e: |