Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author | iss |
---|---|
date | Mon, 21 Mar 2022 15:23:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6bab5103a14 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # -*- coding: utf-8 -*- | |
4 | |
5 ''' | |
6 Adapted from: | |
7 https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py | |
8 mickaelsilva | |
9 ''' | |
10 | |
11 import sys | |
12 import urllib.request | |
13 import urllib.parse | |
14 import xml.etree.ElementTree as ET | |
15 import time | |
16 import argparse | |
17 import os | |
18 | |
19 | |
def _fetch_xml(url, urlopen):
    """Download *url* through *urlopen* and return the root of the parsed XML."""
    # The response is closed as soon as its payload has been read.
    with urlopen(url) as response:
        return ET.fromstring(response.read())


def _experiment_details(experiment_xml, want_omics, want_library):
    """Return ``(model, prjid, omics, libraryType)`` read from an experiment record.

    Fields that are absent from the record come back as empty strings.  When
    several matching elements exist, the last one in document order wins.
    """
    model = prjid = omics = library_type = ''
    for node in experiment_xml.findall('*/PLATFORM/*/INSTRUMENT_MODEL'):
        model = node.text or ''
    for node in experiment_xml.findall('*/STUDY_REF'):
        prjid = node.get('accession') or ''
    if want_omics:
        for node in experiment_xml.findall('*/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE'):
            omics = node.text or ''
    if want_library:
        for node in experiment_xml.findall('*/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_LAYOUT'):
            if len(node) == 0:
                # A layout element without a child cannot be classified; as
                # before, the whole record is then reported as 'not found'.
                return 'not found', prjid, 'not found', 'not found'
            library_type = node[0].tag
    return model, prjid, omics, library_type


def _show_progress(line, width):
    """Overwrite the current stderr line with *line* and return the new pad width.

    Padding to the widest line written so far wipes leftovers of longer,
    earlier messages, because every message restarts at column 0 with '\\r'.
    """
    width = max(width, len(line))
    sys.stderr.write("\r" + line.ljust(width))
    sys.stderr.flush()
    return width


def run_seq_from_web_taxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True,
                           urlopen=None):
    """Write the run accessions found for a taxon name to a tab-separated file.

    The taxon name is first resolved to a taxon ID, then every read run under
    that taxon is requested.  The output file starts with a ``#dd/mm/YYYY``
    date line, followed by one row of five tab-separated columns: run
    accession, instrument model, project accession, library source and
    library layout.  The last four columns are empty unless at least one of
    the three ``get*`` flags is ``True``, because filling them costs one extra
    request per experiment.

    Parameters
    ----------
    taxonname : str
        Taxon name to look up (URL-quoted internally).
    outputfile : str
        Path of the file to write; it is only created when the taxon is found.
    getmachine, getOmicsDataType, getLibraryType : bool
        Request the instrument model, library source and library layout
        respectively.  Any of them being ``True`` triggers the per-experiment
        lookup, which always fills the model and project columns.
    print_True : bool
        When true, a single-line progress indicator is written to stderr.
    urlopen : callable, optional
        ``urlopen(url)`` must return a context manager with a ``read()``
        method.  Defaults to ``urllib.request.urlopen``; supply another opener
        to add a timeout, a proxy, or to run offline.

    Raises
    ------
    Exception
        Whatever the taxon lookup raised (network or XML parsing error), after
        a hint has been printed.  Errors of later requests propagate as is.
    """
    if urlopen is None:
        urlopen = urllib.request.urlopen

    print('\n' + 'Searching RunIDs for ' + taxonname)

    taxonname = urllib.parse.quote(taxonname)
    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + taxonname + "&display=xml"
    try:
        tree = _fetch_xml(url, urlopen)
    except Exception:
        print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated"
              " at " + url)
        raise

    taxonid = ''
    for child in tree:
        taxonid = child.get('taxId')
    if not taxonid:
        print("taxon name does not exist")
        return

    print("\n" + "Taxon ID found: " + taxonid)
    url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + \
          taxonid + \
          "%29%22&result=read_run&display=xml"
    tree = _fetch_xml(url, urlopen)

    want_details = getmachine is True or getOmicsDataType is True or getLibraryType is True
    n = 0
    length_line = 0
    with open(outputfile, "wt") as f:
        f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n")
        for child in tree:
            runid = child.get('accession')
            n += 1

            if not want_details:
                f.write(str(runid) + '\t' * 4 + "\n")
                if print_True:
                    length_line = _show_progress("run acession %s" % (runid,), length_line)
                continue

            # NOTE(review): one row is written per EXPERIMENT_REF child, so a
            # run without such a child yields no row in this mode -- confirm
            # against the upstream file that this nesting is the intended one.
            for child2 in child:
                if child2.tag != 'EXPERIMENT_REF':
                    continue
                expid = child2.get('accession')
                url2 = "http://www.ebi.ac.uk/ena/data/view/" + expid + "&display=xml"
                # The details are rebuilt for every experiment so that a field
                # missing from one record never shows the previous run's value.
                model, prjid, omics, libraryType = _experiment_details(
                    _fetch_xml(url2, urlopen), getOmicsDataType is True, getLibraryType is True)
                f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n")
                if print_True:
                    line = "run acession %s sequenced on %s from project %s for %s %s end" \
                           " data" % (runid, model, prjid, omics, libraryType)
                    length_line = _show_progress(line, length_line)
    print("\n")
    print("\n"
          "found %s run id's" % n)
113 | |
114 | |
def main():
    """Command-line entry point: read the options and fetch the run list.

    The directory of the requested output file is created when it does not
    exist yet, and the progress indicator is always switched on.
    """
    cli = argparse.ArgumentParser(
        description="This program gets a list of sequencing runs and machine were the sequencing was performed,"
                    " given a taxon name accepted by the European nucleotide Archive")

    # (flag, keyword arguments) pairs, registered in the order shown by --help.
    option_table = (
        ('-i', dict(nargs=1, type=str, help='taxon name', metavar='"Streptococcus agalactiae"', required=True)),
        ('-o', dict(nargs=1, type=str, help='output file name', required=True)),
        ('-g', dict(help='True to include sequencing machine in the output', action='store_true',
                    required=False)),
        ('--getOmicsDataType', dict(help='Informs the programme to include OMICS data type (examples: GENOMIC /'
                                         ' TRANSCRIPTOMIC / SYNTHETIC) in the output',
                                    action='store_true')),
        ('--getLibraryType', dict(help='Informs the programme to include library type (examples: PAIRED / SINGLE)'
                                       ' in the output',
                                  action='store_true')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()

    # Resolve the output path once; its parent directory must exist before
    # the file is opened for writing.
    target = os.path.abspath(options.o[0])
    parent = os.path.dirname(target)
    if not os.path.isdir(parent):
        os.makedirs(parent)

    run_seq_from_web_taxon(options.i[0], target, options.g, options.getOmicsDataType, options.getLibraryType, True)
143 | |
144 | |
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()