Mercurial > repos > iss > eurl_vtec_wgs_pt
diff scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author | iss |
---|---|
date | Mon, 21 Mar 2022 15:23:09 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/ReMatCh/modules/seqFromWebTaxon.py Mon Mar 21 15:23:09 2022 +0000 @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# -*- coding: utf-8 -*- + +''' +Adapted from: +https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py +mickaelsilva +''' + +import sys +import urllib.request +import urllib.parse +import xml.etree.ElementTree as ET +import time +import argparse +import os + + +def run_seq_from_web_taxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True): + print('\n' + 'Searching RunIDs for ' + taxonname) + + taxonname = urllib.parse.quote(taxonname) + url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml" + try: + content = urllib.request.urlopen(url) + xml = content.read() + tree = ET.fromstring(xml) + taxonid = '' + except: + print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated" + " at " + url) + raise + for child in tree: + taxonid = child.get('taxId') + if (taxonid): + print("\n" + "Taxon ID found: " + taxonid) + url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + \ + taxonid + \ + "%29%22&result=read_run&display=xml" + + content = urllib.request.urlopen(url) + xml = content.read() + tree = ET.fromstring(xml) + + runid = '' + n = 0 + with open(outputfile, "wt") as f: + f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n") + model = '' + prjid = '' + length_line = 0 + omics = '' + libraryType = '' + for child in tree: + runid = child.get('accession') + + n += 1 + + if getmachine is True or getOmicsDataType is True or getLibraryType is True: + for child2 in child: + if child2.tag == 'EXPERIMENT_REF': + expid = child2.get('accession') + url2 = "http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml" + content = urllib.request.urlopen(url2) + xml = content.read() + tree2 = ET.fromstring(xml) + try: + for child3 in tree2: + for child4 in child3: + if child4.tag == 'PLATFORM': + for child5 in child4: + for child6 in child5: + if child6.tag == 'INSTRUMENT_MODEL': + model = child6.text + elif child4.tag == 'STUDY_REF': + prjid = child4.get('accession') + elif child4.tag == 'DESIGN': + if getOmicsDataType is True or getLibraryType is True: + for child5 in child4: + if child5.tag == 'LIBRARY_DESCRIPTOR': + for child6 in child5: + if child6.tag == 'LIBRARY_SOURCE' and getOmicsDataType is True: + omics = child6.text + elif child6.tag == 'LIBRARY_LAYOUT' and getLibraryType is True: + libraryType = child6[0].tag + except: + model = 'not found' + omics = 'not found' + libraryType = 'not found' + f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n") + if print_True: + line = "run acession %s sequenced on %s from project %s for %s %s end" \ + " data" % (runid, model, prjid, omics, libraryType) + if length_line < len(line): + length_line = len(line) + sys.stderr.write("\r" + line + str(' ' * (length_line - len(line)))) + sys.stderr.flush() + else: + f.write(str(runid) + '\t' * 4 + "\n") + if print_True: + line = "run acession %s" % (runid, prjid) + if length_line < len(line): + length_line = len(line) + sys.stderr.write("\r" + line + str(' ' * (length_line - len(line)))) + sys.stderr.flush() + print("\n") + print("\n" + "found %s run id's" % n) + + else: + print("taxon name does not exist") + + +def main(): + parser = argparse.ArgumentParser(description="This program gets a list of sequencing runs and machine were the" + " sequencing was performed, given a taxon name accepted by the" + " European nucleotide Archive") + parser.add_argument('-i', nargs=1, type=str, help='taxon name', metavar='"Streptococcus agalactiae"', required=True) + parser.add_argument('-o', nargs=1, type=str, help='output file name', required=True) + parser.add_argument('-g', help='True to include sequencing machine in the output', action='store_true', + required=False) + parser.add_argument('--getOmicsDataType', help='Informs the programme to include OMICS data type' + ' (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output', + action='store_true') + parser.add_argument('--getLibraryType', help='Informs the programme to include library type' + ' (examples: PAIRED / SINGLE) in the output', action='store_true') + + args = parser.parse_args() + + getmachine = args.g + taxonname = args.i[0] + + outdir = os.path.dirname(os.path.abspath(args.o[0])) + if not os.path.isdir(outdir): + os.makedirs(outdir) + outputfile = os.path.abspath(args.o[0]) + + getOmicsDataType = args.getOmicsDataType + getLibraryType = args.getLibraryType + + run_seq_from_web_taxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, True) + + +if __name__ == "__main__": + main()