# HG changeset patch
# User card
# Date 1551276501 18000
# Node ID 715bc9aeef6930075f9e590485ac7251968498c8
planemo upload for repository https://github.com/arpcard/rgi commit 7a78289be23c5a14ae39f454610fa8eca3f05188

diff -r 000000000000 -r 715bc9aeef69 README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,6 @@
+RGI wrapper
+--------------------
+
+This wrapper is used to run Resistance Gene Identifier (RGI) on galaxy environment.
+
+This tool is used together with data manager for RGI: `rgi_database_builder `_.
diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_conf.xml Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,24 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ rgi_databases/${value}/${path}
+
+
+ ${GALAXY_DATA_MANAGER_DATA_PATH}/rgi_databases/${value}/${path}
+ abspath
+
+
+
+
+
diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_rgi_build_db/data_manager/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_rgi_build_db/data_manager/.gitignore Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,1 @@
+*.pyc
\ No newline at end of file
diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_rgi_build_db/data_manager/import_data.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_rgi_build_db/data_manager/import_data.py Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,209 @@
+import argparse
+import datetime
+import json
+import os
+import shutil
+import sys
+import tarfile
+import urllib.request, urllib.error, urllib.parse
+import zipfile
+import logging
+import subprocess
+
+path = os.path.join(os.getcwd(), 'rgi-database')
+data_path = path
+
+level = logging.WARNING
+logger = logging.getLogger(__name__)
+logger.setLevel(level)
+
+# CARD model_type_id values exported to proteindb.fsa. The flag tells whether
+# the FASTA header of that model type also carries the model's SNP list.
+PROTEIN_MODEL_TYPES = {
+    '40292': False,  # protein homolog model
+    '40293': True,  # protein variant model
+    '41091': True,  # protein overexpression model
+}
+
+def url_download(url, workdir):
+    '''Download the archive at url and extract card.json from it into workdir.
+
+    The download is stored as download.dat inside workdir and deleted once
+    the archive has been processed. A download error is printed to stderr;
+    extraction is skipped when no file was written. A download that is
+    neither a tar nor a zip archive is left in place untouched.
+    '''
+    file_path = os.path.join(workdir, 'download.dat')
+    os.makedirs(workdir, exist_ok=True)
+    try:
+        with urllib.request.urlopen(urllib.request.Request(url)) as src, \
+                open(file_path, 'wb') as dst:
+            shutil.copyfileobj(src, dst)
+    except Exception as e:
+        print(str(e), file=sys.stderr)
+        # Without this guard a failed request would crash below, because
+        # tarfile.is_tarfile() raises when download.dat does not exist.
+        if not os.path.isfile(file_path):
+            return
+    if tarfile.is_tarfile(file_path):
+        with tarfile.open(file_path, 'r:*') as archive:
+            for member in archive.getmembers():
+                # only regular files; directories and links are skipped
+                if member.isreg() and os.path.basename(member.name) == 'card.json':
+                    # drop the directory prefix so the file lands directly in workdir
+                    member.name = 'card.json'
+                    print('[import_data] extracting file: {}'.format(member.name))
+                    archive.extract(member, workdir)
+    elif zipfile.is_zipfile(file_path):
+        # ZipFile has no getmembers()/isreg(); walk its ZipInfo entries instead
+        with zipfile.ZipFile(file_path, 'r') as archive:
+            for info in archive.infolist():
+                if not info.is_dir() and os.path.basename(info.filename) == 'card.json':
+                    print('[import_data] extracting file: card.json')
+                    with archive.open(info) as src, \
+                            open(os.path.join(workdir, 'card.json'), 'wb') as dst:
+                        shutil.copyfileobj(src, dst)
+    else:
+        return
+    os.remove(file_path)
+
+def checkKeyExisted(key, my_dict):
+    '''Return True when my_dict has key and the stored value is not None.'''
+    try:
+        return my_dict[key] is not None
+    except KeyError:
+        return False
+
+def data_version():
+    '''Return the '_version' entry of card.json, or '' when it has none.'''
+    with open(os.path.join(data_path, 'card.json')) as json_file:
+        return json.load(json_file).get('_version', '')
+
+def _run_tool(command, silent=False):
+    '''Run an external command given as an argument list; return its exit status.
+
+    No shell is involved, so a working directory containing spaces or shell
+    metacharacters cannot change the command. A missing executable or a
+    non-zero exit status is logged rather than raised.
+    '''
+    sink = subprocess.DEVNULL if silent else None
+    try:
+        status = subprocess.run(command, stdout=sink, stderr=sink).returncode
+    except OSError as e:
+        logger.warning('could not run %s: %s', command[0], e)
+        return 127
+    if status != 0:
+        logger.warning('%s exited with status %s', command[0], status)
+    return status
+
+def makeBlastDB():
+    '''Build the BLAST protein database from proteindb.fsa, if that file exists.'''
+    fasta = os.path.join(path, 'proteindb.fsa')
+    if os.path.isfile(fasta):
+        print('[import_data] create blast DB.')
+        _run_tool(['makeblastdb', '-in', fasta, '-dbtype', 'prot',
+                   '-out', os.path.join(path, 'protein.db')], silent=True)
+
+def makeDiamondDB():
+    '''Build the DIAMOND database from proteindb.fsa, if that file exists.'''
+    fasta = os.path.join(path, 'proteindb.fsa')
+    if os.path.isfile(fasta):
+        print('[import_data] create diamond DB.')
+        _run_tool(['diamond', 'makedb', '--quiet', '--in', fasta,
+                   '--db', os.path.join(path, 'protein.db')])
+
+def _warn_omitted(missing, model):
+    '''Log that model is left out of proteindb.fsa because `missing` is absent.'''
+    logger.warning('No %s for model (%s, %s). RGI will omit this model and keep running.',
+                   missing, model.get('model_id'), model.get('model_name'))
+    logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
+
+def _fasta_records(model_key, model):
+    '''Return the FASTA lines for one CARD model, or [] when it has to be omitted.
+
+    All lines are assembled before anything is returned, so a model with
+    incomplete data never contributes a header without its sequence.
+    '''
+    type_id = model['model_type_id']
+    try:
+        pass_bit_score = model['model_param']['blastp_bit_score']['param_value']
+    except KeyError:
+        _warn_omitted('bitscore', model)
+        return []
+
+    if PROTEIN_MODEL_TYPES[type_id]:
+        try:
+            snps = ','.join(model['model_param']['snp']['param_value'].values())
+        except (KeyError, TypeError, AttributeError):
+            # Skip the model here: carrying on would either fail later or
+            # reuse the SNP list left over from a previously processed model.
+            _warn_omitted('snp', model)
+            return []
+        # NOTE: 'pass_bit_score' here vs 'pass_bitscore' below is kept as is,
+        # in case existing consumers of the file match on either spelling.
+        prefix = ' | model_type_id: %s | pass_bit_score: %s | SNP: %s | ' % (type_id, pass_bit_score, snps)
+    else:
+        prefix = ' | model_type_id: %s | pass_bitscore: %s | ' % (type_id, pass_bit_score)
+
+    lines = []
+    try:
+        sequences = model['model_sequences']['sequence']
+        for seq in sequences:
+            lines.append('>%s_%s%s%s\n' % (model_key, seq, prefix, model['ARO_name']))
+            lines.append('%s\n' % (sequences[seq]['protein_sequence']['sequence']))
+    except (KeyError, TypeError):
+        _warn_omitted('model sequences', model)
+        return []
+    return lines
+
+def write_fasta_from_json():
+    '''Creates a fasta file from card.json file.
+
+    Does nothing when proteindb.fsa already exists. Exits with status 1 when
+    card.json cannot be read, so the failure is visible to the caller.
+    '''
+    fasta_path = os.path.join(path, 'proteindb.fsa')
+    if os.path.isfile(fasta_path):
+        return
+    try:
+        with open(os.path.join(data_path, 'card.json'), 'r') as jfile:
+            j = json.load(jfile)
+    except Exception as e:
+        logger.error(e)
+        sys.exit(1)
+
+    with open(fasta_path, 'w') as fout:
+        for i in j:
+            # only numeric keys with a known protein model type are exported
+            if i.isdigit() and j[i].get('model_type_id') in PROTEIN_MODEL_TYPES:
+                fout.writelines(_fasta_records(i, j[i]))
+
+def _main(args):
+    '''Download CARD data, build the FASTA, BLAST and DIAMOND files; return the data version.'''
+    if not os.path.exists(path):
+        print('[import_data] mkdir: {}'.format(path))
+        os.makedirs(path)
+    print('[import_data] path: {}'.format(path))
+    print(args)
+
+    url = 'https://card.mcmaster.ca/latest/data' if args.url is None else args.url
+    print('[import_data] url: {}'.format(url))
+    workdir = os.path.join(os.getcwd(), 'rgi-database')
+    print('[import_data] working directory: {}'.format(workdir))
+    url_download(url, workdir)
+    write_fasta_from_json()
+    makeBlastDB()
+    makeDiamondDB()
+    version = data_version()
+    print('[import_data] data version: {}'.format(version))
+    return version
+
+def run():
+    '''Command-line entry point: parse --url and run the import.'''
+    parser = argparse.ArgumentParser(description='Create data manager json.')
+    parser.add_argument('--url', dest='url', action='store', help='Url for CARD data')
+    args = parser.parse_args()
+    _main(args)
+
+if __name__ == '__main__':
+    run()
diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_rgi_build_db/data_manager/rgi_database_builder.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_rgi_build_db/data_manager/rgi_database_builder.py Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,56 @@
+import argparse
+import datetime
+import json
+import os
+import shutil
+import sys
+import tarfile
+import urllib.request, urllib.error, urllib.parse
+import zipfile
+from import_data import run, _main
+
+parser = argparse.ArgumentParser(description='Create data manager json.')
+parser.add_argument('--url', dest='url', action='store', help='Url for CARD data')
+parser.add_argument('--out', dest='output', action='store', help='JSON filename')
+parser.add_argument('--name', dest='name', action='store', default='CARD_data-' + str(datetime.datetime.now().strftime('%Y-%B-%d-%H:%M:%S')), help='Data table database name')
+args = parser.parse_args()
+
+print('[rgi_database_builder] Importing...')
+
+_main(args)
+
+def main(args):
+    '''Move the built database files to the job's extra files directory and write the data table entry.
+
+    The file named by args.output is first read as JSON to find
+    output_data[0]['extra_files_path'], then overwritten with the
+    rgi_databases data table entry.
+    '''
+    print('[rgi_database_builder] Building......')
+
+    data_manager_entry = {
+        'value': args.name.lower(),
+        'name': args.name,
+        'path': '.',
+    }
+    data_manager_json = dict(data_tables=dict(rgi_databases=data_manager_entry))
+
+    with open(args.output, 'r') as handle:
+        params = json.load(handle)
+
+    target_directory = params['output_data'][0]['extra_files_path']
+    # os.mkdir() would raise if the directory is already there
+    os.makedirs(target_directory, exist_ok=True)
+    output_path = os.path.join(os.getcwd(), 'rgi-database')
+
+    for filename in os.listdir(output_path):
+        print('[rgi_database_builder] move file: {} from {} to {}'.format(filename, output_path, target_directory))
+        shutil.move(os.path.join(output_path, filename), target_directory)
+
+    print(args.output)
+    print('[rgi_database_builder] write file: {}'.format(args.output))
+    with open(args.output, 'w') as handle:
+        json.dump(data_manager_json, handle)
+
+if __name__ == '__main__':
+    main(args)
diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_rgi_build_db/data_manager/rgi_database_builder.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_rgi_build_db/data_manager/rgi_database_builder.xml Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,28 @@
+
+
+ Download and build the CARD database for RGI
+
+ rgi
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_rgi_build_db/tool-data/rgi_databases.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_managers/data_manager_rgi_build_db/tool-data/rgi_databases.loc.sample Wed Feb 27 09:08:21 2019 -0500
@@ -0,0 +1,5 @@
+#This file lists the columns that will be specified by the RGI Data Manager tool.
+# +#For example: +# +#rgi_20181001 rgi_20181001 /galaxy-central/tool-data/rgi_databases/rgi_20181001 diff -r 000000000000 -r 715bc9aeef69 data_managers/data_manager_rgi_build_db/tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_managers/data_manager_rgi_build_db/tool_data_table_conf.xml.sample Wed Feb 27 09:08:21 2019 -0500 @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+
diff -r 000000000000 -r 715bc9aeef69 rgi.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rgi.xml Wed Feb 27 09:08:21 2019 -0500 @@ -0,0 +1,187 @@ + + This tool predicts resistome(s) from protein or nucleotide data based on homology and SNP models. + + rgi + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1093/nar/gkw1004 + + diff -r 000000000000 -r 715bc9aeef69 test-data/test1.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1.fasta Wed Feb 27 09:08:21 2019 -0500 @@ -0,0 +1,68 @@ +>AY123251.1 Salmonella enterica subsp. enterica serovar Typhi R-plasmid pST2301 class I integron aminoglycoside 6'-N-acetyltransferase (aacA4), chloramphenicol acetyltransferase (catB8), aminoglycoside 3'-adenyltransferase (aadA1), dihydrofolate reductase type I (dhfr1), aminoglycoside 6'-n-acetyltransferase (aac6-II), and CARB-8 beta-lactamase (blaCARB-8) genes, complete cds +TCATGGCTTGTTATGACTGTTTTTTTGTACAGTCTATGCCTCGGGCATCCAAGCAGCAAGCGCGTTACGC +CGTGGGTCGATGTTTGATGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAAAACAAAG +TTAGGCATCACAAAGTACAGCATCGTGACCAACAGCAACGATTCCGTCACACTGCGCCTCATGACTGAGC +ATGACCTTGCGATGCTCTATGAGTGGCTAAATCGATCTCATATCGTCGAGTGGTGGGGCGGAGAAGAAGC +ACGCCCGACACTTGCTGACGTACAGGAACAGTACTTGCCAAGCGTTTTAGCGCAAGAGTCCGTCACTCCA +TACATTGCAATGCTGAATGGAGAGCCGATTGGGTATGCCCAGTCGTACGTTGCTCTTGGAAGCGGGGACG +GATGGTGGGAAGAAGAAACCGATCCAGGAGTACGCGGAATAGACCAGTCACTGGCGAATGCATCACAACT +GGGCAAAGGCTTGGGAACCAAGCTGGTTCGAGCTCTGGTTGAGTTGCTGTTCAATGATCCCGAGGTCACC +AAGATCCAAACGGACCCGTCGCCGAGCAACTTGCGAGCGATCCGATGCTACGAGAAAGCGGGGTTTGAGA +GGCAAGGTACCGTAACCACCCCAGATGGTCCAGCCGTGTACATGGTTCAAACACGCCAGGCATTCGAGCG +AACACGCAGTGTTGCCTAACCCTTCCATCGAGGGGGACGTCCAAGGGCTGGCGCCCTTGGCCGCCCCTCA +TGTCAAACGTTAGACGGCAAGAAAAGGTTCCACGAACTCTGATGAAAAACTACTTTAACAGCCCTTTCAA +AGGGGAACTTCTTTCTGAGCAAGTGAAAAATCCAAATATCAGAGTAGGCCGGTATAGCTATTACTCTGGC 
+TACTATCACGGGCACTCATTTGATGAATGCGCGCGATACTTGCTTCCAGATCGTGATGACGTTGATAAAT +TGATCATTGGCAGCTTTTGTTCTATAGGAAGCGGGGCTTCCTTCATCATGGCTGGCAATCAGGGGCATCG +GCATGACTGGGCATCATCCTTCCCCTTCTTCTATATGCAAGAGGAGCCTGCTTTCTCAAGAGCACTCGAC +GCCTTCCAAAGAGCAGGTGATACCGTCATTGGCAATGATGTCTGGATAGGCTCGGAGGCAATGATTATGC +CTGGCATCAAAATTGGAGACGGTGCCGTGATAGGTAGTCGCTCGTTGGTGACAAAAGATGTAGAGCCTTA +TGCCATCATCGGGGGAAATCCCGCAAAGCAAATTAAGAAGCGCTTCTCCGATGAGGAAATCTCATTGCTC +ATGGAGATGGAGTGGTGGAACTGGCCACTAGATAAAATTAAGACAGCAATGCCTCTGCTGTGCTCGTCAA +ATATTTTTGGTCTGCATAAGTATTGGCGCGAGTTTGCCGTCTAACAATTCATTCAAGCCGACGCCGCTTC +GCGGCACGGCTTAATTCTGGCGTTAAACATCATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTA +TCAGAGGTAGTTGGCGTCATCGAGCGCCATCTCGAACCGACGTTGCTGGCCGTACATTTGTACGGCTCCG +CAGTGGATGGCGGCCTGAAGCCACACAGTGATATTGATTTGCTGGTTACGGTGACCGTAAGGCTTGATGA +AACAACGCGGCGAGCTTTGATCAACGACCTTTTGGAAACTTCGGCTTCCCCTGGAGAGAGCGAGATTCTC +CGCGCTGTAGAAGTCACCATTGTTGTGCACGACGACATCATTCCGTGGCGTTATCCAGCTAAGCGCGAAC +TGCAATTTGGAGAATGGCAGCGCAATGACATTCTTGCAGGTATCTTCGAGCCAGCCACGATCGACATTGA +TCTGGCTATCTTGCTGACAAAAGCAAGAGAACATAGCGTTGCCTTGGTAGGTCCAGCGGCGGAGGAACTC +TTTGATCCGGTTCCTGAACAGGATCTATTTGAGGCGCTAAATGAAACCTTAACGCTATGGAACTCGCCGC +CCGACTGGGCTGGCGATGAGCGAAATGTAGTGCTTACGTTGTCCCGCATTTGGTACAGCGCAGTAACCGG +CAAAATCGCGCCGAAGGATGTCGCTGCCGACTGGGCAATGGAGCGCCTGCCGGCCCAGTATCAGCCCGTC +ATACTTGAAGCTAGACAGGCTTATCTTGGACAAGAAGAAGATCGCTTGGCCTCCCGCGCAGATCAGTTGG +AAGAATTTGTTCACTACGTGAAAGGCGAGATCACCAAGGTAGTCGGCAAATAATGTCTAACAATTCGTTC +AAGCCGACGCCGCTTCGCGGCGCGGCTTAACTCAAGCGTTAACCTCTGAGGAAGAATTGTGAAACTATCA +CTAATGGTAGCTATATCGAAGAATGGAGTTATCGGGAATGGCCCTGATATTCCATGGAGTGCCAAAGGTG +AACAGCTCCTGTTTAAAGCTATTACCTATAACCAATGGCTGTTGGTTGGACGCAAGACTTTTGAGTCAAT +GGGAGCATTACCCAACCGAAAGTATGCGGTCGTAACACGTTCAAGTTTTACATCTGACAATGAGAACGTA +GTGATCTTTCCATCAATTAAAGATGCTTTAACCAACCTAAAGAAAATAACGGATCATGTCATTGTTTCAG +GTGGTGGGGAGATATACAAAAGCCTGATCGATCAAGTAGATACACTACATATATCTACAATAGACATCGA +GCCGGAAGGTGATGTTTACTTTCCTGAAATCCCCAGCAATTTTAGGCCAGTTTTTACCCAAGACTTCGCC 
+TCTAACATAAATTATAGTTACCAAATCTGGCAAAAGGGTTAACAAGTGGCAGCAACGGATTCGCAAACCT +GTCACGCCTTTTGTACCAAAACCCGCGCCAGGTTTGCGATCCGCTGTGCCAGGCGTTAGGCAGCACAGAG +CGACCATTTCATGTCCGCGAGCACCCCCCCCATAACTCTTCGCCTCATGACCGAGCGCGACCTGCCGATG +CTCCATGATTGGCTCAACCGGCCGCACATCGTTGAGTGGTGGGGTGGTGACGAAGAGCGACCGACTCTTG +ATGAAGTGCTGGAACACTACCTGCCCAGAGCGATGGCGGAAGAGTCCGTAACACCGTACATCGCAATGCT +GGGCGAGGAACCGATCGGCTATGCTCAGTCGTACGTCGCGCTCGGAAGCGGTGATGGCTGGTGGGAAGAT +GAAACTGATCCAGGAGTGCGAGGAATAGACCAGTCTCTGGCTGACCCGACACAGTTGAACAAAGGCCTAG +GAACAAGGCTTGTCCGCGCTCTCGTTGAACTACTGTTCTCGGACCCCACCGTGACGAAGATTCAGACCGA +CCCGACTCCGAACAACCATCGAGCCATACGCTGCTATGAGAAGGCAGGATTCGTGCGGGAGAAGATCATC +ACCACGCCTGACGGGCCGGCGGTTTACATGGTTCAAACACGACAAGCCTTCGAGAGAAAGCGCGGTGTTG +CCTAACAACTCATTCAAGCCGACGCCGCTTCGCGGCGCGGCTTAATTCAGGTGTTAGCCATATTATGGAG +CCTCATGCTTTTATATAAAATGTGTGACAATCAAAATTATGGGGTTACTTACATGAAGTTTTTATTGGTA +TTTTCGCTTTTAATACCATCCGTGGTTTTTGCAAGTAGTTCAAAGTTTCGGCAAGTTGAACAAGACGTTA +AGGCAATTGAAGTTTCTCTTTCTGCTCGTATAGGTGTTTCCGTTCTTGATACTCAAAATGGAGAATACTG +GGATTACAATGGCAATCAGCGCTTCCCGTTGACAAGTACTTTTAAAACAATAGCTTGCGCTAAATTACTA +TATGATGCTGAGCAAGGAAAAGTTAATCCCAATAGTACAATCGAGATTAAGAAAGCAGATCTTGTGACCT +ATTCCCCTGTAATAGAAAAGCAAGTAGGGCAGGCAATCACACTCGATGATGCGTGCTTCGCAACTATGAC +TACAAGTGATAATACTGCGGCAAATATCATCCTAAGTGCTGTAGGTGGCTCCAAAGGCGTTACTGATTTT +TTAAGACAAATTGGGGACAAAGAGACTCGTCTAGACCGTATTGAGCCTGATTTAAATGAAGGTAAGCTCG +GTGATTTGAGGGATACGACAACTCCTAAGGCAATAGCCAGTACTTTGAATAAATTTTTATTTGGTTCAGC +GCTATCTGAAATGAACAAAAAAAAATTAGAGTCTTGGATGGTGAACAATCAAGTCACTGGTAATTTACTA +CGTTCAGTATTGCCGGCGGGATGGAACATTGCGGATCGTTCAGGTGCTGGCGGATTTGGTGCTCGGAGTA +TTACAGCAGTTGTGTGGAGTGAGCATCAAGCCCCAATTATTGTGAGCATCTATCTAGCTCAAACACAGGC +TTCAATGGCAGAGCGAAATGATGCGATTGTTAAAATTGGTCGTTCAATTTTTGACGTTTATACATCACAG +TCGCGCTGATAAGGCTAACAAGGCCATCAAGTTGACGGCTTTTCCGTCGCTTGTTTTGTGGCTTAACGCT +ACGCTACCACAAAACAATCAACTACAAAGCCGCAACTTATGGCGGCGTTAGATACACTAAGCACATAATT +GCTCACAGCCAAACTATCAGGTCAAGTCTGCTTTTATTATTTTTAAGCGTGCATAATAAGCCCTAC \ No newline at end of file