14
|
1 #!/usr/bin/env python
|
0
|
2 ''' RepeatMasker search against custom database
|
|
3 input:
|
|
4 - archive with sequencing data
|
|
5 - custom repeat database
|
|
6 '''
|
|
7 import zipfile
|
|
8 import shutil
|
|
9 import os
|
|
10 import subprocess
|
|
11 from parallel import parallel2
|
|
12 import glob
|
|
13
|
|
14 print(os.environ)
|
|
15 def extract_sequences(f):
|
|
16 # check archive:
|
|
17 try:
|
|
18 z=zipfile.ZipFile(f)
|
|
19 # extract only dirCLXXXX/reads.fas
|
|
20 seq_list = []
|
|
21 for filein in z.namelist():
|
11
|
22 c1 = filein.lower().startswith("seqclust/clustering/clusters/dir_cl")
|
|
23 c2 = filein.endswith("reads.fas")
|
|
24 c3 = filein.endswith("reads.fasta") # in newer RE2 versions
|
|
25 if c1 and (c2 or c3):
|
0
|
26 outdir = filein.split("/")[3]
|
|
27 outfile = outdir +"/reads.fas"
|
|
28 source = z.open(filein)
|
|
29 os.mkdir(outdir)
|
|
30 target = open(outfile, "wb")
|
|
31 shutil.copyfileobj(source, target)
|
|
32 seq_list.append(outfile)
|
|
33 if len(seq_list) == 0:
|
|
34 raise ValueError()
|
|
35
|
|
36 except zipfile.BadZipfile as e:
|
|
37 print("unable to extract sequences from archive!")
|
|
38 raise e
|
|
39
|
|
40 except IOError as e:
|
|
41 print("unable to extract sequences from archive!")
|
|
42 raise e
|
|
43
|
|
44 except ValueError as e:
|
|
45 print("No sequences found in archive!")
|
|
46 raise e
|
|
47
|
|
48 seq_list.sort()
|
|
49 return seq_list
|
|
50
|
|
51
|
|
52 def RepeatMasker(reads,database):
|
|
53 args = ["RepeatMasker", reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)]
|
|
54 print(args)
|
|
55 status=subprocess.call(args , stderr = open(os.devnull, 'wb'))
|
|
56 return status
|
|
57
|
|
58 def summarizeRepeatMaskerOutput(htmlout = "summary.html"):
|
|
59 cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r"
|
14
|
60 args = [ cmd, "dir_CL*/reads.fas", "dir_CL*/reads.fas.out", "RM-custom_output_table" ]
|
0
|
61 status=subprocess.call(args)
|
|
62 cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R"
|
|
63 args = [cmd, htmlout]
|
|
64 status=subprocess.call(args)
|
|
65 return status
|
|
66
|
|
67
|
|
68 def main():
|
|
69 from optparse import OptionParser
|
|
70
|
|
71 parser = OptionParser()
|
|
72 parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive")
|
|
73 parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database")
|
|
74 parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory")
|
|
75 parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html')
|
|
76
|
|
77 options, args = parser.parse_args()
|
|
78 config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config"
|
|
79
|
|
80
|
|
81 seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING
|
|
82 parallel2(RepeatMasker, seq_files, [options.database])
|
|
83
|
|
84 status = summarizeRepeatMaskerOutput(options.report)
|
|
85
|
|
86
|
|
87
|
|
88 if __name__== "__main__":
|
|
89 main()
|
|
90
|
|
91
|