Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
| author | iss |
|---|---|
| date | Mon, 21 Mar 2022 15:23:09 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c6bab5103a14 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # -*- coding: utf-8 -*- | |
| 4 | |
| 5 ''' | |
| 6 Adapted from: | |
| 7 https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py | |
| 8 mickaelsilva | |
| 9 ''' | |
| 10 | |
| 11 import sys | |
| 12 import urllib.request | |
| 13 import urllib.parse | |
| 14 import xml.etree.ElementTree as ET | |
| 15 import time | |
| 16 import argparse | |
| 17 import os | |
| 18 | |
| 19 | |
def _fetch_xml_root(url):
    """Download ``url`` and return the root element of the XML document it serves."""
    # The context manager closes the HTTP response as soon as the payload has
    # been read, instead of leaving the connection to the garbage collector.
    with urllib.request.urlopen(url) as response:
        return ET.fromstring(response.read())


def _read_experiment_fields(experiment, want_omics, want_library):
    """Collect the metadata fields present in an experiment XML tree.

    Returns a dict containing only the fields that were found, using the keys
    'model', 'prjid', 'omics' and 'libraryType'. When a LIBRARY_LAYOUT element
    has no child, 'model', 'omics' and 'libraryType' are set to 'not found'.
    """
    found = {}
    try:
        for child3 in experiment:
            for child4 in child3:
                if child4.tag == 'PLATFORM':
                    # The instrument model is recorded whenever it is present,
                    # whichever metadata flag triggered the lookup.
                    for child5 in child4:
                        for child6 in child5:
                            if child6.tag == 'INSTRUMENT_MODEL':
                                found['model'] = child6.text or ''
                elif child4.tag == 'STUDY_REF':
                    found['prjid'] = child4.get('accession') or ''
                elif child4.tag == 'DESIGN' and (want_omics or want_library):
                    for child5 in child4:
                        if child5.tag == 'LIBRARY_DESCRIPTOR':
                            for child6 in child5:
                                if child6.tag == 'LIBRARY_SOURCE' and want_omics:
                                    found['omics'] = child6.text or ''
                                elif child6.tag == 'LIBRARY_LAYOUT' and want_library:
                                    # The layout is the tag of the first child;
                                    # an empty LIBRARY_LAYOUT raises IndexError.
                                    found['libraryType'] = child6[0].tag
    except IndexError:
        found['model'] = 'not found'
        found['omics'] = 'not found'
        found['libraryType'] = 'not found'
    return found


def _report_progress(line, length_line):
    """Overwrite the current stderr line with ``line``; return the widest length seen."""
    length_line = max(length_line, len(line))
    # Pad with spaces so a shorter line fully erases a longer previous one.
    sys.stderr.write("\r" + line.ljust(length_line))
    sys.stderr.flush()
    return length_line


def run_seq_from_web_taxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True):
    """Write the run accessions found for a taxon name to a tab-separated file.

    The taxon name is resolved to a taxon ID, the read runs under that ID are
    requested, and ``outputfile`` receives a ``#dd/mm/YYYY`` header followed by
    one row per run: run accession, instrument model, project accession,
    library source and library layout. The last four columns are empty unless
    at least one of the metadata flags is ``True``.

    Parameters:
        taxonname: taxon name to search for (URL-quoted before use).
        outputfile: path of the file to write (overwritten).
        getmachine: ``True`` to look up each run's experiment record.
        getOmicsDataType: ``True`` to also record the library source.
        getLibraryType: ``True`` to also record the library layout.
        print_True: when truthy, show per-run progress on stderr.

    Returns:
        None. When no taxon ID is found a message is printed and no file is
        written.

    Raises:
        Any exception raised while downloading or parsing the XML responses,
        or while writing the output file.
    """
    print('\n' + 'Searching RunIDs for ' + taxonname)

    taxonname = urllib.parse.quote(taxonname)
    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml"
    try:
        tree = _fetch_xml_root(url)
    except Exception:
        print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated"
              " at " + url)
        raise

    taxonid = ''
    for child in tree:
        taxonid = child.get('taxId')

    if not taxonid:
        print("taxon name does not exist")
        return

    print("\n" + "Taxon ID found: " + taxonid)
    url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + \
          taxonid + \
          "%29%22&result=read_run&display=xml"
    tree = _fetch_xml_root(url)

    # The flags are compared with ``is True`` so that only a real boolean True
    # switches a feature on, as before.
    want_omics = getOmicsDataType is True
    want_library = getLibraryType is True
    want_details = getmachine is True or want_omics or want_library

    n = 0
    length_line = 0
    with open(outputfile, "wt") as f:
        f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n")
        for child in tree:
            runid = child.get('accession')
            n += 1

            if want_details:
                # Start every run from empty fields so that a run lacking a
                # field is not reported with the previous run's value.
                fields = {'model': '', 'prjid': '', 'omics': '', 'libraryType': ''}
                for child2 in child:
                    if child2.tag != 'EXPERIMENT_REF':
                        continue
                    expid = child2.get('accession')
                    if not expid:
                        # Without an accession there is no record to request.
                        continue
                    experiment = _fetch_xml_root("http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml")
                    fields.update(_read_experiment_fields(experiment, want_omics, want_library))

                model = fields['model']
                prjid = fields['prjid']
                omics = fields['omics']
                libraryType = fields['libraryType']
                f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n")
                if print_True:
                    line = "run acession %s sequenced on %s from project %s for %s %s end" \
                           " data" % (runid, model, prjid, omics, libraryType)
                    length_line = _report_progress(line, length_line)
            else:
                f.write(str(runid) + '\t' * 4 + "\n")
                if print_True:
                    # One placeholder, one argument: passing a second value
                    # here raised TypeError on the first run.
                    line = "run acession %s" % (runid,)
                    length_line = _report_progress(line, length_line)
    print("\n")
    print("\n"
          "found %s run id's" % n)
| 113 | |
| 114 | |
def main():
    """Parse the command line and run the taxon search.

    Options: ``-i`` taxon name (required), ``-o`` output file (required),
    ``-g`` / ``--getOmicsDataType`` / ``--getLibraryType`` to request the
    extra metadata columns. The directory of the output file is created when
    it does not exist, and progress reporting is always switched on.
    """
    description = ("This program gets a list of sequencing runs and machine were the"
                   " sequencing was performed, given a taxon name accepted by the"
                   " European nucleotide Archive")
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument('-i', nargs=1, type=str, required=True, metavar='"Streptococcus agalactiae"',
                     help='taxon name')
    cli.add_argument('-o', nargs=1, type=str, required=True, help='output file name')
    cli.add_argument('-g', action='store_true', required=False,
                     help='True to include sequencing machine in the output')
    cli.add_argument('--getOmicsDataType', action='store_true',
                     help='Informs the programme to include OMICS data type'
                          ' (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output')
    cli.add_argument('--getLibraryType', action='store_true',
                     help='Informs the programme to include library type'
                          ' (examples: PAIRED / SINGLE) in the output')

    options = cli.parse_args()

    # -i and -o are declared with nargs=1, so each value arrives wrapped in a
    # single-element list.
    target = os.path.abspath(options.o[0])
    parent = os.path.dirname(target)
    if not os.path.isdir(parent):
        os.makedirs(parent)

    run_seq_from_web_taxon(options.i[0], target, options.g, options.getOmicsDataType,
                           options.getLibraryType, True)
| 143 | |
| 144 | |
# Run the command-line interface only when this file is executed as a script,
# so that importing the module has no side effects.
if __name__ == "__main__":
    main()
