Mercurial > repos > cstrittmatter > mitokmer
view kmer_read_m3.py @ 1:1434bc7b4786 draft
planemo upload commit 03463f4b0598df5619def5230de3fb758b4090ba-dirty
author | cstrittmatter |
---|---|
date | Mon, 22 Apr 2019 09:37:19 -0400 |
parents | 1472b4f4fbfe |
children | c9d98f5bc240 |
line wrap: on
line source
#Reads fastq filenames, runs kmerread, and generates report #MKM 1/1/2019 import re import sys #import numpy as np -- removed numpy requirement from subprocess import Popen, PIPE if __name__ == '__main__': if len(sys.argv) > 2: item = sys.argv[1].strip() dirnm = sys.argv[2].strip() if item == "-w": wdir = dirnm else: print "w?" if len(sys.argv) > 4: item = sys.argv[3].strip() dirnm = sys.argv[4].strip() if item == "-d": oname = dirnm + "/mitokmer_result.csv" else: print "d?" if len(sys.argv) > 7: item = sys.argv[5].strip() filenm1 = sys.argv[6].strip() filenm2 = sys.argv[7].strip() if item == "-i": file1 = filenm1 file2 = filenm2 else: print "i?" target = 0 wdir += "/" reffile = wdir + "mitochondria_refkey.txt" cname = wdir + "result.txt" exclude_b = [] #read in reference names and counts factor_list = [] name_list = [] in_use = [] data_file = open(reffile, 'r') data = ''.join(data_file.readlines()) data_file.close() lines = data.split('\n') lines.pop(0) #header for line in lines: if len(line) > 1: target, name, count, hit, tested, gsize, nstrains = line.split('\t') row = name.split('_') target = int(target) hit = float(hit) if nstrains != '0': gensize = float(gsize) / float(nstrains) else: gensize = 1.0 tested = float(tested) use = "1" if target in exclude_b or count < 10.0 or hit < 10.0 or len(row) < 5: use = "0" in_use.append(int(use)) if use == "1": name_list.append(name) factor_list.append(tested / hit / gensize) #factor_arr = np.array(factor_list) num_targs = len(name_list) process = Popen([wdir + '/kmerread', '-wdir', wdir, '-f1', file1, '-f2', file2]) (stdout, stderr) = process.communicate() #read in output files noid_list = [] num_cols = 1 #only one sample, no matrix needed #m = np.zeros((num_targs,num_cols)) m = [0.0 for _ in range(num_targs)] col = 0 data_file = open(cname, 'r') data = ''.join(data_file.readlines()) data_file.close() lines = data.split('\n') read_ct = 0.0 index = 0 for line in lines: if len(line) > 1: t_s, count, uniq = line.split(',') target = int(t_s) read_ct += float(count) if target > 0: if in_use[target]: m[index] = float(count) index += 1 else: noid_list.append(int(count)) #b = m * factor_arr[:,None] #normalize each row by kmer coverage #sums = np.sum(b, axis=0) #b = b / sums[None,:] #b = b * 100.0 sum = 0.0 b = [] for i in range(num_targs): b1 = m[i] * factor_list[i] sum += b1 b.append(b1) sum /= 100.0 for i in range(num_targs): b[i] /=sum #rowmax = b.max(axis=1) out_file = open(oname, 'w') output = "taxid,reads,abundance\n" out_file.write(output) output = "total," #for i in range(num_cols): output += str(read_ct) + ",," output += "\n" out_file.write(output) output = "no_id," for i in range(num_cols): output += str(noid_list[i]) + ",," output += "\n" out_file.write(output) for i in range(num_targs): #l = order_row[i] if m[i] > 0: #only output non-zero results output = name_list[i] for j in range(num_cols): output += ',' + "{0:.0f}".format(m[i]) + ',' + "{0:.3f}".format(b[i]) output += "\n" out_file.write(output) out_file.close()