annotate RM_custom_search.py.bak @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
2 ''' RepeatMasker search against custom database
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
3 input:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
4 - archive with sequencing data
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
5 - custom repeat database
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
6 '''
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
7 import zipfile
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
8 import shutil
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
9 import os
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
10 import subprocess
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
11 import parallel
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
12 import glob
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
13
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
14 def extract_sequences(f):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
15 # check archive:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
16 try:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
17 z=zipfile.ZipFile(f)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
18 # extract only dirCLXXXX/reads.fas
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
19 seq_list = []
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
20 for filein in z.namelist():
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
21 if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
22 outdir = filein.split("/")[3]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
23 outfile = outdir +"/reads.fas"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
24 source = z.open(filein)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
25 os.mkdir(outdir)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
26 target = file(outfile, "wb")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
27 shutil.copyfileobj(source, target)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
28 seq_list.append(outfile)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
29 if len(seq_list) == 0:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
30 raise ValueError()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
31
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
32 except zipfile.BadZipfile as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
33 print "unable to extract sequences from archive!"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
34 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
35
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
36 except IOError as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
37 print "unable to extract sequences from archive!"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
38 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
39
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
40 except ValueError as e:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
41 print "No sequences found in archive!"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
42 raise e
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
43
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
44 seq_list.sort()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
45 return seq_list
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
46
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
47 def get_RM_dir(config_file,galaxy_dir):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
48 shutil.copy(config_file,"seqclust.config")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
49 f = open("seqclust.config",'a')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
50 f.write("\necho $REPEAT_MASKER")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
51 f.close()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
52 args = ["bash", "seqclust.config"]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
53 p = subprocess.Popen(args,stdout = subprocess.PIPE)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
54 RMdir = "{0}{1}".format(galaxy_dir,p.stdout.readline().strip())
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
55 return RMdir
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
56
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
57 def RepeatMasker(RM,reads,database):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
58 args = [RM, reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
59 status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
60 return status
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
61
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
62 def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
63 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
64 args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
65 status=subprocess.call(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
66 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
67 args = [cmd, htmlout]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
68 status=subprocess.call(args)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
69 return status
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
70
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
71
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
72 def main():
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
73 from optparse import OptionParser
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
74
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
75 parser = OptionParser()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
76 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
77 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
78 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
79 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
80
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
81 options, args = parser.parse_args()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
82 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
83
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
84
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
85 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
86 RMdir = get_RM_dir(config_file, options.galaxy_dir)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
87 parallel.parallel(RepeatMasker, [RMdir+"/RepeatMasker"], seq_files, [options.database])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
88
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
89 status = summarizeRepeatMaskerOutput(options.report)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
90
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
91
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
92
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
93 if __name__== "__main__":
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
94 main()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
95
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
96