comparison scripts/ReMatCh/modules/seqFromWebTaxon.py @ 0:c6bab5103a14 draft

"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author iss
date Mon, 21 Mar 2022 15:23:09 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c6bab5103a14
1 #!/usr/bin/env python3
2
3 # -*- coding: utf-8 -*-
4
5 '''
6 Adapted from:
7 https://github.com/mickaelsilva/pythonscripts/blob/master/SeqOfWeb/SeqFromWebTaxon.py
8 mickaelsilva
9 '''
10
11 import sys
12 import urllib.request
13 import urllib.parse
14 import xml.etree.ElementTree as ET
15 import time
16 import argparse
17 import os
18
19
def _fetch_xml(url):
    """Download *url* and return the root element of the parsed XML payload.

    The HTTP response is closed as soon as its body has been read. Network
    errors and XML parse errors are propagated to the caller.
    """
    with urllib.request.urlopen(url) as response:
        return ET.fromstring(response.read())


def _read_experiment_metadata(experiment_tree, getOmicsDataType, getLibraryType):
    """Collect the metadata fields present in an experiment XML tree.

    Returns a dict holding only the keys that were actually found among
    'model', 'prjid', 'omics' and 'libraryType'. If walking the tree fails,
    'model', 'omics' and 'libraryType' are set to 'not found' (best effort:
    one malformed record must not abort the whole listing).
    """
    found = {}
    try:
        for experiment in experiment_tree:
            for section in experiment:
                if section.tag == 'PLATFORM':
                    for platform in section:
                        for field in platform:
                            if field.tag == 'INSTRUMENT_MODEL':
                                # An empty element has text None; keep it a string.
                                found['model'] = field.text or ''
                elif section.tag == 'STUDY_REF':
                    found['prjid'] = section.get('accession') or ''
                elif section.tag == 'DESIGN':
                    if getOmicsDataType is True or getLibraryType is True:
                        for design_item in section:
                            if design_item.tag != 'LIBRARY_DESCRIPTOR':
                                continue
                            for field in design_item:
                                if field.tag == 'LIBRARY_SOURCE' and getOmicsDataType is True:
                                    found['omics'] = field.text or ''
                                elif field.tag == 'LIBRARY_LAYOUT' and getLibraryType is True:
                                    # The layout is encoded as the tag of the first child element.
                                    found['libraryType'] = field[0].tag
    except Exception:
        found.update(model='not found', omics='not found', libraryType='not found')
    return found


def _show_progress(line, widest):
    """Overwrite the current stderr line with *line*.

    *widest* is the longest line written so far; the new line is padded with
    spaces up to that width so leftovers of a longer previous line are erased.
    Returns the updated widest width.
    """
    widest = max(widest, len(line))
    sys.stderr.write("\r" + line + ' ' * (widest - len(line)))
    sys.stderr.flush()
    return widest


def run_seq_from_web_taxon(taxonname, outputfile, getmachine, getOmicsDataType, getLibraryType, print_True):
    """Write the run accessions found for a taxon name to a tab-separated file.

    The taxon name is resolved to a taxon ID, the read runs under that ID are
    listed, and one row per run is written to *outputfile* after a header line
    of the form '#dd/mm/YYYY'. Each row is: run accession, instrument model,
    project accession, omics data type, library type. The last four columns
    are left empty unless at least one of the three flags is True.

    Parameters:
        taxonname: taxon name to search for (URL-quoted before use).
        outputfile: path of the file to (over)write.
        getmachine, getOmicsDataType, getLibraryType: when any of them is
            exactly True, the experiment record of every run is downloaded to
            fill in the metadata columns. The instrument model and project
            accession are filled whenever any flag is set; omics and library
            type only when their own flag is set.
        print_True: when truthy, a progress line is rewritten on stderr for
            every run.

    Returns:
        None. If no taxon ID is found, a message is printed and no file is
        written.

    Raises:
        Whatever the download or XML parsing raises; a hint is printed first
        when the initial taxon lookup fails.
    """
    print('\n' + 'Searching RunIDs for ' + taxonname)

    # NOTE(review): these look like the legacy ENA browser endpoints - confirm they are still served.
    url = "http://www.ebi.ac.uk/ena/data/view/Taxon%3A" + urllib.parse.quote(taxonname) + "&display=xml"
    try:
        tree = _fetch_xml(url)
    except Exception:
        print("Ooops!There might be a problem with the ena service, try later or check if the xml is well formated"
              " at " + url)
        raise

    # The taxId of the last child element wins, as before.
    taxonid = ''
    for child in tree:
        taxonid = child.get('taxId')

    if not taxonid:
        print("taxon name does not exist")
        return

    print("\n" + "Taxon ID found: " + taxonid)
    url = "http://www.ebi.ac.uk/ena/data/warehouse/search?query=%22tax_tree%28" + \
          taxonid + \
          "%29%22&result=read_run&display=xml"
    tree = _fetch_xml(url)

    want_metadata = getmachine is True or getOmicsDataType is True or getLibraryType is True
    n = 0
    length_line = 0
    with open(outputfile, "wt") as f:
        f.write('#' + str(time.strftime("%d/%m/%Y")) + "\n")
        for child in tree:
            runid = child.get('accession')
            n += 1

            if want_metadata:
                # Start from empty fields for every run so that a run lacking
                # a field does not inherit the previous run's value.
                metadata = {'model': '', 'prjid': '', 'omics': '', 'libraryType': ''}
                for child2 in child:
                    if child2.tag == 'EXPERIMENT_REF':
                        url2 = "http://www.ebi.ac.uk/ena/data/view/" + child2.get('accession') + "&display=xml"
                        metadata.update(
                            _read_experiment_metadata(_fetch_xml(url2), getOmicsDataType, getLibraryType))
                model = metadata['model']
                prjid = metadata['prjid']
                omics = metadata['omics']
                libraryType = metadata['libraryType']

                f.write(str(runid) + "\t" + model + "\t" + prjid + "\t" + omics + "\t" + libraryType + "\n")
                line = "run acession %s sequenced on %s from project %s for %s %s end" \
                       " data" % (runid, model, prjid, omics, libraryType)
            else:
                f.write(str(runid) + '\t' * 4 + "\n")
                # One placeholder, one argument: the former two-item tuple raised TypeError.
                line = "run acession %s" % (runid,)

            if print_True:
                length_line = _show_progress(line, length_line)

    print("\n")
    print("\n"
          "found %s run id's" % n)
113
114
def main():
    """Command-line entry point.

    Parses the options, creates the directory of the output file when it does
    not exist yet, and runs the taxon search with progress reporting enabled.
    """
    cli = argparse.ArgumentParser(
        description="This program gets a list of sequencing runs and machine were the"
                    " sequencing was performed, given a taxon name accepted by the"
                    " European nucleotide Archive")
    cli.add_argument('-i', nargs=1, type=str, required=True, metavar='"Streptococcus agalactiae"',
                     help='taxon name')
    cli.add_argument('-o', nargs=1, type=str, required=True, help='output file name')
    cli.add_argument('-g', action='store_true', required=False,
                     help='True to include sequencing machine in the output')
    cli.add_argument('--getOmicsDataType', action='store_true',
                     help='Informs the programme to include OMICS data type'
                          ' (examples: GENOMIC / TRANSCRIPTOMIC / SYNTHETIC) in the output')
    cli.add_argument('--getLibraryType', action='store_true',
                     help='Informs the programme to include library type'
                          ' (examples: PAIRED / SINGLE) in the output')

    options = cli.parse_args()

    # -i and -o use nargs=1, so each value arrives as a one-element list.
    target_path = os.path.abspath(options.o[0])
    target_dir = os.path.dirname(target_path)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    run_seq_from_web_taxon(
        options.i[0],
        target_path,
        options.g,
        options.getOmicsDataType,
        options.getLibraryType,
        True,
    )
143
144
# Call main() only when this module's __name__ is "__main__".
if __name__ == "__main__":
    main()