5
|
1 #!/usr/bin/python
|
|
2
|
|
3 import optparse, os, re, shutil, sys, tempfile, glob, shlex, vcf, pysam, tarfile, csv, operator
|
|
4 from subprocess import *
|
|
5 import subprocess
|
|
6 import multiprocessing
|
|
7 from joblib import Parallel, delayed
|
|
8 from itertools import izip
|
|
9
|
|
10 NCPU = multiprocessing.cpu_count()
|
|
11 CHUNK_SIZE = 2**20 #1mb
|
|
12
|
|
def create_dict(files):
    """Group file paths by sample prefix.

    The prefix is the first two '-'-separated tokens of a file's basename.
    Prefixes coming from basenames containing 'RC' are skipped, but any
    path whose string contains a kept prefix (including RC files) is
    grouped under it.

    Returns a dict mapping prefix -> list of matching paths from *files*.
    """
    grouped = dict()
    basenames = [os.path.basename(path) for path in files]
    prefixes = ['-'.join(name.split('-')[0:2]) for name in basenames if 'RC' not in name]
    for prefix in prefixes:
        for path in files:
            if prefix not in path:
                continue
            # substring match, mirroring how the merged outputs are named
            grouped.setdefault(prefix, []).append(path)
    return grouped
|
|
26
|
|
def file_merge(file1, file2, out_dir):
    """Concatenate two tab-separated files into one comma-separated file.

    The output is written to ``<out_dir>/<base>_merged<ext>`` where
    *base*/*ext* are taken from *file1*'s basename. The first line
    (header) of each input is skipped; remaining rows are re-emitted
    comma-delimited with no quoting (backslash escaping).

    Bug fixed: the original wrapped the real copy loop inside a second,
    redundant loop over the same two inputs, so each file was opened
    twice and every data row was written twice. Each input is now read
    exactly once. ``file_f.next()`` was also replaced with the portable
    ``next(reader)`` builtin.
    """
    sources = (file1, file2)
    base = os.path.splitext(os.path.basename(file1))[0]
    ext = os.path.splitext(os.path.basename(file1))[1]
    out_path = "%s/%s_merged%s" % (out_dir, base, ext)
    with open(out_path, "w") as fw:
        writer = csv.writer(fw, delimiter=',', quotechar='',
                            quoting=csv.QUOTE_NONE, escapechar='\\')
        for src in sources:
            with open(src, "r") as f:
                reader = csv.reader(f, delimiter='\t')
                next(reader)  # skip the header row of each input
                for row in reader:
                    writer.writerow(row)
|
|
41
|
|
42
|
|
def cleanup_before_exit(tmp_dir):
    """Recursively delete *tmp_dir* if it is set and exists.

    A falsy path or an already-missing directory is silently ignored, so
    this is safe to call unconditionally on any exit path.
    """
    if not tmp_dir:
        return
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
|
|
46
|
|
def parallel_jobs(i, piq_dir_path, tmp_dir, inputF, out_dir, output_dir):
    """Run the three-step PIQ pipeline for motif index *i*.

    Chains pwmmatch.exact.r, bam2rdata.r and pertf.r into a single shell
    command executed with cwd=*output_dir*. stdout/stderr are captured in
    temp files under *tmp_dir*; the captured stderr is then relayed to
    this process's stderr when the pipeline failed, or to stdout when it
    succeeded, so it always lands in the job log.
    """
    motif = i
    steps = (
        "Rscript %s/pwmmatch.exact.r %s/common.r %s/jasparVertebrateCore2016-519-matrices.txt %d %s/; " % (piq_dir_path, piq_dir_path, piq_dir_path, motif, tmp_dir),
        #"Rscript %s/pwmmatch.exact.r %s/common.r %s/jasparfix.txt %d %s/; " % (piq_dir_path, piq_dir_path, piq_dir_path, motif, tmp_dir),
        "Rscript %s/bam2rdata.r %s/common.r %s/bam.RData %s; " % (piq_dir_path, piq_dir_path, tmp_dir, inputF),
        "Rscript %s/pertf.r %s/common.r %s/ %s %s/ %s/bam.RData %d; " % (piq_dir_path, piq_dir_path, tmp_dir, tmp_dir, out_dir, tmp_dir, motif),
    )
    cmd = "".join(steps)
    stdout = tempfile.NamedTemporaryFile(prefix="piq-stdout-", dir=tmp_dir)
    stderr = tempfile.NamedTemporaryFile(prefix="piq-stderr-", dir=tmp_dir)
    proc = subprocess.Popen(args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=output_dir)
    return_code = proc.wait()

    # Non-zero exit: surface the R output on our stderr; otherwise stdout.
    stderr_target = sys.stderr if return_code else sys.stdout
    stderr.flush()
    stderr.seek(0)
    while True:
        chunk = stderr.read(CHUNK_SIZE)
        if not chunk:
            break
        stderr_target.write(chunk)
    stderr.close()
    stdout.close()
|
|
77
|
|
def __main__():
    """Galaxy tool entry point: run PIQ over all 519 JASPAR motifs in
    parallel, then merge the per-motif calls.all csv/bed outputs into a
    single combined BED copied to --output.
    """
    # Hard-coded install location of the PIQ R scripts on the Galaxy host.
    piq_dir_path="/mnt/galaxyTools/tools/piq/1.3/thashim"
    parser = optparse.OptionParser()
    parser.add_option('', '--input', dest="inputF", action='store', type="string", default=None, help='')
    parser.add_option( '', '--output', dest='outputF', action='store', type="string", default=None, help='')
    parser.add_option( '', '--out-dir', dest='output_dir', action='store', type="string", default=None, help='If specified, the output directory for extra files.' )

    (options, args) = parser.parse_args()

    # Working layout: <output_dir>/out for PIQ results, <output_dir>/tmp
    # for intermediates (RData, captured subprocess logs).
    if not os.path.exists(options.output_dir):
        os.mkdir(options.output_dir)
    out_dir = "%s/out" % options.output_dir;
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    tmp_dir = "%s/tmp" % options.output_dir;
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    # One PIQ pipeline per motif index (1..519), fanned out over all CPUs.
    maxMotif=519
    Parallel(n_jobs=NCPU)(delayed(parallel_jobs)(i, piq_dir_path, tmp_dir, options.inputF, out_dir, options.output_dir) for i in range(1,maxMotif+1))

    # Combine per-motif CSV outputs: pairs (forward + RC) sharing a sample
    # prefix get concatenated; singletons are just copied with the
    # "_merged" suffix so later globs find them uniformly.
    csvFiles =sorted(glob.glob("%s/*calls.all.csv" % out_dir))
    csvList=create_dict(csvFiles)

    for v in csvList.values():
        if len(v) == 2:
            file_merge(v[0],v[1],out_dir)
        else:
            outF=os.path.splitext(os.path.basename(v[0]))[0]
            outExt=os.path.splitext(os.path.basename(v[0]))[1]
            shutil.copy(v[0],"%s/%s_merged%s" % (out_dir, outF, outExt))

    # Same merge/copy pass for the BED outputs.
    bedFiles=sorted(glob.glob("%s/*calls.all.bed" % out_dir))
    bedList=create_dict(bedFiles)

    for v in bedList.values():
        if len(v) == 2:
            file_merge(v[0],v[1],out_dir)
        else:
            outF=os.path.splitext(os.path.basename(v[0]))[0]
            outExt=os.path.splitext(os.path.basename(v[0]))[1]
            shutil.copy(v[0],"%s/%s_merged%s" % (out_dir, outF, outExt))

    # Pair each merged bed with its merged csv line-by-line.
    # NOTE(review): assumes both sorted globs line up index-for-index and
    # that paired files have equal line counts (zip truncates to the
    # shorter) — verify against PIQ's output naming.
    m_csvFiles=sorted(glob.glob("%s/*_merged.csv" % out_dir))
    m_bedFiles=sorted(glob.glob("%s/*_merged.bed" % out_dir))

    for i in range(len(m_csvFiles)):
        outF=os.path.splitext(os.path.basename(m_csvFiles[i]))[0]
        #print "\noutF: %s\n" % outF
        # Join bed line + csv line into one comma-separated record.
        with open('%s/%s_all_tmp' % (out_dir,outF), 'wb') as res, open(m_bedFiles[i]) as f1, open(m_csvFiles[i]) as f2:
            for line1, line2 in zip(f1, f2):
                res.write("{},{}\n".format(line1.rstrip(), line2.rstrip()))
        with open('%s/%s_all_tmp' % (out_dir,outF), "rb") as f3:
            rdr=csv.reader(f3)

            # Keep only complete records and project the columns of
            # interest into a tab-separated BED, stripping the backslash
            # escapes introduced by the csv merge step.
            with open('%s/%s_final.bed' % (out_dir,outF),'wb') as result:
                wtr = csv.writer(result, delimiter='\t', quotechar='', quoting=csv.QUOTE_NONE)
                #for r in sorted_list:
                for r in rdr:
                    if len(r) > 12:
                        wtr.writerow((r[0],r[1],r[2],r[3],r[5],r[9].replace('\\',""),r[10].replace('\\',""),r[11].replace('\\',""),r[12]))
    # Concatenate every per-motif final bed into one combined file, while
    # also copying each individual final bed into the extra-files dir.
    filenames=sorted(glob.glob("%s/*_final.bed" % out_dir))
    #shutil.copy('%s/*_final.bed' % out_dir, options.output_dir)
    with open('%s/all_motif_comb_final.bed' % options.output_dir, 'w') as outfile:
        for fname in filenames:
            shutil.copy('%s' % fname, options.output_dir)
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

    # Deliver the combined bed to the Galaxy-provided output path.
    shutil.copy('%s/all_motif_comb_final.bed' % options.output_dir, options.outputF)

    # Remove the whole working directory (out/, tmp/, intermediates).
    cleanup_before_exit( options.output_dir )
|
|
153
|
|
154 if __name__=="__main__": __main__()
|
|
155
|