comparison filter-below-abund.py @ 6:bfd859f04a89 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/khmer commit e0cd7ae10ce97bed51594e7cc0b969a803d698b7
author iuc
date Fri, 07 Sep 2018 11:01:41 -0400
parents f0290a079720
children
comparison
equal deleted inserted replaced
5:f0290a079720 6:bfd859f04a89
36 from __future__ import print_function 36 from __future__ import print_function
37 37
38 import os 38 import os
39 import sys 39 import sys
40 40
41 import khmer 41 import screed
42 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter 42 from khmer import Countgraph, ReadParser
43 from khmer.utils import (broken_paired_reader, write_record)
43 44
44 WORKER_THREADS = 8
45 GROUPSIZE = 100
46 CUTOFF = 50 45 CUTOFF = 50
47 46
48 47
49 def main(): 48 def main():
50 counting_ht = sys.argv[1] 49 counting_ht = sys.argv[1]
51 infiles = sys.argv[2:] 50 infiles = sys.argv[2:]
52 51
53 print('file with ht: %s' % counting_ht) 52 print('file with ht: %s' % counting_ht)
54 print('-- settings:')
55 print('N THREADS', WORKER_THREADS)
56 print('--')
57 53
58 print('making hashtable') 54 print('making hashtable')
59 ht = khmer.load_countgraph(counting_ht) 55 ht = Countgraph.load(counting_ht)
60 K = ht.ksize() 56 K = ht.ksize()
61 57
62 for infile in infiles: 58 for infile in infiles:
63 print('filtering', infile) 59 print('filtering', infile)
64 outfile = os.path.basename(infile) + '.below' 60 outfile = os.path.basename(infile) + '.below'
65 61
66 outfp = open(outfile, 'w') 62 outfp = open(outfile, 'w')
67 63
68 def process_fn(record, ht=ht): 64 paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
69 name = record['name'] 65 force_single=True)
70 seq = record['sequence'] 66 for n, is_pair, read1, read2 in paired_iter:
67 name = read1.name
68 seq = read1.sequence
71 if 'N' in seq: 69 if 'N' in seq:
72 return None, None 70 return None, None
73 71
74 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) 72 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
75 73
76 if trim_at >= K: 74 if trim_at >= K:
77 return name, trim_seq 75 write_record(screed.Record(name=name, sequence=trim_seq), outfp)
78
79 return None, None
80
81 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
82
83 tsp.start(verbose_fasta_iter(infile), outfp)
84 76
85 77
86 if __name__ == '__main__': 78 if __name__ == '__main__':
87 main() 79 main()