Mercurial > repos > iuc > khmer_extract_partitions
comparison filter-below-abund.py @ 5:518ba4a77274 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/khmer commit e0cd7ae10ce97bed51594e7cc0b969a803d698b7
author | iuc |
---|---|
date | Fri, 07 Sep 2018 10:59:19 -0400 |
parents | 7d8138b4a593 |
children |
comparison
equal
deleted
inserted
replaced
4:7d8138b4a593 | 5:518ba4a77274 |
---|---|
36 from __future__ import print_function | 36 from __future__ import print_function |
37 | 37 |
38 import os | 38 import os |
39 import sys | 39 import sys |
40 | 40 |
41 import khmer | 41 import screed |
42 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter | 42 from khmer import Countgraph, ReadParser |
43 from khmer.utils import (broken_paired_reader, write_record) | |
43 | 44 |
44 WORKER_THREADS = 8 | |
45 GROUPSIZE = 100 | |
46 CUTOFF = 50 | 45 CUTOFF = 50 |
47 | 46 |
48 | 47 |
49 def main(): | 48 def main(): |
50 counting_ht = sys.argv[1] | 49 counting_ht = sys.argv[1] |
51 infiles = sys.argv[2:] | 50 infiles = sys.argv[2:] |
52 | 51 |
53 print('file with ht: %s' % counting_ht) | 52 print('file with ht: %s' % counting_ht) |
54 print('-- settings:') | |
55 print('N THREADS', WORKER_THREADS) | |
56 print('--') | |
57 | 53 |
58 print('making hashtable') | 54 print('making hashtable') |
59 ht = khmer.load_countgraph(counting_ht) | 55 ht = Countgraph.load(counting_ht) |
60 K = ht.ksize() | 56 K = ht.ksize() |
61 | 57 |
62 for infile in infiles: | 58 for infile in infiles: |
63 print('filtering', infile) | 59 print('filtering', infile) |
64 outfile = os.path.basename(infile) + '.below' | 60 outfile = os.path.basename(infile) + '.below' |
65 | 61 |
66 outfp = open(outfile, 'w') | 62 outfp = open(outfile, 'w') |
67 | 63 |
68 def process_fn(record, ht=ht): | 64 paired_iter = broken_paired_reader(ReadParser(infile), min_length=K, |
69 name = record['name'] | 65 force_single=True) |
70 seq = record['sequence'] | 66 for n, is_pair, read1, read2 in paired_iter: |
67 name = read1.name | |
68 seq = read1.sequence | |
71 if 'N' in seq: | 69 if 'N' in seq: |
72 return None, None | 70 return None, None |
73 | 71 |
74 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) | 72 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) |
75 | 73 |
76 if trim_at >= K: | 74 if trim_at >= K: |
77 return name, trim_seq | 75 write_record(screed.Record(name=name, sequence=trim_seq), outfp) |
78 | |
79 return None, None | |
80 | |
81 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) | |
82 | |
83 tsp.start(verbose_fasta_iter(infile), outfp) | |
84 | 76 |
85 | 77 |
86 if __name__ == '__main__': | 78 if __name__ == '__main__': |
87 main() | 79 main() |