annotate RM_custom_search.py @ 21:f4ed6a65a2ff draft

Uploaded
author petr-novak
date Thu, 27 Jul 2023 09:46:13 +0000
parents 62fefa284036
children 628b235d76c7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14
62fefa284036 Uploaded
petr-novak
parents: 11
diff changeset
1 #!/usr/bin/env python
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
2 ''' RepeatMasker search against custom database
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
3 input:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
4 - archive with sequencing data
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
5 - custom repeat database
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
6 '''
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
7 import zipfile
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
8 import shutil
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
9 import os
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
10 import subprocess
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
11 from parallel import parallel2
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
12 import glob
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
13
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
14 print(os.environ)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
15 def extract_sequences(f):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
16 # check archive:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
17 try:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
18 z=zipfile.ZipFile(f)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
19 # extract only dirCLXXXX/reads.fas
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
20 seq_list = []
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
21 for filein in z.namelist():
11
16150c85fb3a Uploaded
petr-novak
parents: 0
diff changeset
22 c1 = filein.lower().startswith("seqclust/clustering/clusters/dir_cl")
16150c85fb3a Uploaded
petr-novak
parents: 0
diff changeset
23 c2 = filein.endswith("reads.fas")
16150c85fb3a Uploaded
petr-novak
parents: 0
diff changeset
24 c3 = filein.endswith("reads.fasta") # in newer RE2 versions
16150c85fb3a Uploaded
petr-novak
parents: 0
diff changeset
25 if c1 and (c2 or c3):
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
26 outdir = filein.split("/")[3]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
27 outfile = outdir +"/reads.fas"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
28 source = z.open(filein)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
29 os.mkdir(outdir)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
30 target = open(outfile, "wb")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
31 shutil.copyfileobj(source, target)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
32 seq_list.append(outfile)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
33 if len(seq_list) == 0:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
34 raise ValueError()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
35
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
36 except zipfile.BadZipfile as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
37 print("unable to extract sequences from archive!")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
38 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
39
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
40 except IOError as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
41 print("unable to extract sequences from archive!")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
42 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
43
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
44 except ValueError as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
45 print("No sequences found in archive!")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
46 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
47
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
48 seq_list.sort()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
49 return seq_list
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
50
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
51
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
52 def RepeatMasker(reads,database):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
53 args = ["RepeatMasker", reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
54 print(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
55 status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
56 return status
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
57
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
58 def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
59 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
14
62fefa284036 Uploaded
petr-novak
parents: 11
diff changeset
60 args = [ cmd, "dir_CL*/reads.fas", "dir_CL*/reads.fas.out", "RM-custom_output_table" ]
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
61 status=subprocess.call(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
62 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
63 args = [cmd, htmlout]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
64 status=subprocess.call(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
65 return status
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
66
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
67
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
68 def main():
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
69 from optparse import OptionParser
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
70
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
71 parser = OptionParser()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
72 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
73 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
74 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
75 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
76
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
77 options, args = parser.parse_args()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
78 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
79
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
80
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
81 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
82 parallel2(RepeatMasker, seq_files, [options.database])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
83
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
84 status = summarizeRepeatMaskerOutput(options.report)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
85
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
86
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
87
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
88 if __name__== "__main__":
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
89 main()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
90
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
91