Mercurial > repos > iuc > khmer_normalize_by_median

diff filter-below-abund.py @ 6:bfd859f04a89 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/khmer commit e0cd7ae10ce97bed51594e7cc0b969a803d698b7
author: iuc
date: Fri, 07 Sep 2018 11:01:41 -0400
parents: f0290a079720
--- a/filter-below-abund.py	Sat Jan 21 14:43:49 2017 -0500
+++ b/filter-below-abund.py	Fri Sep 07 11:01:41 2018 -0400
@@ -38,11 +38,10 @@
 import os
 import sys
 
-import khmer
-from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter
+import screed
+from khmer import Countgraph, ReadParser
+from khmer.utils import (broken_paired_reader, write_record)
 
-WORKER_THREADS = 8
-GROUPSIZE = 100
 CUTOFF = 50
 
 
@@ -51,12 +50,9 @@
     infiles = sys.argv[2:]
 
     print('file with ht: %s' % counting_ht)
-    print('-- settings:')
-    print('N THREADS', WORKER_THREADS)
-    print('--')
 
     print('making hashtable')
-    ht = khmer.load_countgraph(counting_ht)
+    ht = Countgraph.load(counting_ht)
     K = ht.ksize()
 
     for infile in infiles:
@@ -65,22 +61,18 @@
 
         outfp = open(outfile, 'w')
 
-        def process_fn(record, ht=ht):
-            name = record['name']
-            seq = record['sequence']
+        paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
+                                           force_single=True)
+        for n, is_pair, read1, read2 in paired_iter:
+            name = read1.name
+            seq = read1.sequence
             if 'N' in seq:
                 return None, None
 
             trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
 
             if trim_at >= K:
-                return name, trim_seq
-
-            return None, None
-
-        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
-
-        tsp.start(verbose_fasta_iter(infile), outfp)
+                write_record(screed.Record(name=name, sequence=trim_seq), outfp)
 
 
 if __name__ == '__main__':
author	iuc
date	Fri, 07 Sep 2018 11:01:41 -0400
parents	f0290a079720
children