annotate RM_custom_search.py @ 7:89c5ba120b21 draft

Uploaded
author petr-novak
date Mon, 02 Dec 2019 08:41:43 -0500
parents a4cd8608ef6b
children 16150c85fb3a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
2 ''' RepeatMasker search against custom database
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
3 input:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
4 - archive with sequencing data
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
5 - custom repeat database
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
6 '''
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
7 import zipfile
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
8 import shutil
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
9 import os
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
10 import subprocess
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
11 from parallel import parallel2
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
12 import glob
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
13
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
14 print(os.environ)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
15 def extract_sequences(f):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
16 # check archive:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
17 try:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
18 z=zipfile.ZipFile(f)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
19 # extract only dirCLXXXX/reads.fas
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
20 seq_list = []
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
21 for filein in z.namelist():
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
22 if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
23 outdir = filein.split("/")[3]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
24 outfile = outdir +"/reads.fas"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
25 source = z.open(filein)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
26 os.mkdir(outdir)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
27 target = open(outfile, "wb")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
28 shutil.copyfileobj(source, target)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
29 seq_list.append(outfile)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
30 if len(seq_list) == 0:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
31 raise ValueError()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
32
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
33 except zipfile.BadZipfile as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
34 print("unable to extract sequences from archive!")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
35 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
36
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
37 except IOError as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
38 print("unable to extract sequences from archive!")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
39 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
40
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
41 except ValueError as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
42 print("No sequences found in archive!")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
43 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
44
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
45 seq_list.sort()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
46 return seq_list
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
47
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
48
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
49 def RepeatMasker(reads,database):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
50 args = ["RepeatMasker", reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
51 print(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
52 status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
53 return status
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
54
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
55 def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
56 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
57 args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
58 status=subprocess.call(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
59 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
60 args = [cmd, htmlout]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
61 status=subprocess.call(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
62 return status
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
63
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
64
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
65 def main():
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
66 from optparse import OptionParser
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
67
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
68 parser = OptionParser()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
69 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
70 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
71 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
72 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
73
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
74 options, args = parser.parse_args()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
75 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
76
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
77
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
78 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
79 parallel2(RepeatMasker, seq_files, [options.database])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
80
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
81 status = summarizeRepeatMaskerOutput(options.report)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
82
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
83
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
84
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
85 if __name__== "__main__":
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
86 main()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
87
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
88