comparison RM_custom_search.py.bak @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 #!/usr/bin/env python
2 ''' RepeatMasker search against custom database
3 input:
4 - archive with sequencing data
5 - custom repeat database
6 '''
7 import zipfile
8 import shutil
9 import os
10 import subprocess
11 import parallel
12 import glob
13
14 def extract_sequences(f):
15 # check archive:
16 try:
17 z=zipfile.ZipFile(f)
18 # extract only dirCLXXXX/reads.fas
19 seq_list = []
20 for filein in z.namelist():
21 if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"):
22 outdir = filein.split("/")[3]
23 outfile = outdir +"/reads.fas"
24 source = z.open(filein)
25 os.mkdir(outdir)
26 target = file(outfile, "wb")
27 shutil.copyfileobj(source, target)
28 seq_list.append(outfile)
29 if len(seq_list) == 0:
30 raise ValueError()
31
32 except zipfile.BadZipfile as e:
33 print "unable to extract sequences from archive!"
34 raise e
35
36 except IOError as e:
37 print "unable to extract sequences from archive!"
38 raise e
39
40 except ValueError as e:
41 print "No sequences found in archive!"
42 raise e
43
44 seq_list.sort()
45 return seq_list
46
47 def get_RM_dir(config_file,galaxy_dir):
48 shutil.copy(config_file,"seqclust.config")
49 f = open("seqclust.config",'a')
50 f.write("\necho $REPEAT_MASKER")
51 f.close()
52 args = ["bash", "seqclust.config"]
53 p = subprocess.Popen(args,stdout = subprocess.PIPE)
54 RMdir = "{0}{1}".format(galaxy_dir,p.stdout.readline().strip())
55 return RMdir
56
57 def RepeatMasker(RM,reads,database):
58 args = [RM, reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
59 status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
60 return status
61
62 def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
63 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
64 args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ]
65 status=subprocess.call(args)
66 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
67 args = [cmd, htmlout]
68 status=subprocess.call(args)
69 return status
70
71
72 def main():
73 from optparse import OptionParser
74
75 parser = OptionParser()
76 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
77 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
78 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
79 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
80
81 options, args = parser.parse_args()
82 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
83
84
85 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING
86 RMdir = get_RM_dir(config_file, options.galaxy_dir)
87 parallel.parallel(RepeatMasker, [RMdir+"/RepeatMasker"], seq_files, [options.database])
88
89 status = summarizeRepeatMaskerOutput(options.report)
90
91
92
93 if __name__== "__main__":
94 main()
95
96