diff nested_collection.py @ 1:86770eea5b09 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
author galaxyp
date Sat, 04 Mar 2017 20:36:03 -0500
parents
children 7a0951d0e13e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nested_collection.py	Sat Mar 04 20:36:03 2017 -0500
@@ -0,0 +1,52 @@
+import argparse
+import os
+import re
+from collections import OrderedDict
+
+
+def get_filename_index_with_identifier(realnames, pool_id):
+    """Return the positions in realnames of all names matching pool_id.
+
+    pool_id is used as a regular expression and may match anywhere
+    inside a name. Positions are returned in input order.
+    """
+    return [position for position, name in enumerate(realnames)
+            if re.search(pool_id, name)]
+
+
+def get_batches_of_galaxyfiles(realnames, batchsize, pool_ids):
+    """Yield (pool name, batch) tuples that group the input files.
+
+    realnames -- file names the pool identifiers are matched against
+    batchsize -- maximum number of files per batch, as int or numeric
+                 string. A false value (False, None, '', 0) disables
+                 splitting, so each pool is yielded as a single batch.
+    pool_ids  -- regex patterns that each define a pool, or a false
+                 value to put all files in one pool named 'pool0'
+
+    Each batch is a list of indices into realnames. Batches never span
+    two pools: the leftover files of a pool are yielded as a smaller
+    batch before the next pool starts. Pools without any matching file
+    yield nothing.
+
+    Raises ValueError on the first iteration if batchsize is set but is
+    not a valid integer.
+    """
+    # Convert once, up front, instead of for every appended file. A
+    # limit of 0 can never equal the length of a non-empty batch, so it
+    # means "do not split".
+    max_batch = int(batchsize) if batchsize else 0
+    if pool_ids:
+        # Keyed by pool id so a repeated id is only processed once.
+        filegroups = OrderedDict(
+            (p_id, get_filename_index_with_identifier(realnames, p_id))
+            for p_id in pool_ids)
+    else:
+        filegroups = OrderedDict([('pool0', range(len(realnames)))])
+    for pool_id, grouped_indices in filegroups.items():
+        batch = []
+        for index in grouped_indices:
+            batch.append(index)
+            if len(batch) == max_batch:
+                yield pool_id, batch
+                batch = []
+        if batch:
+            yield pool_id, batch
+
+
+def main():
+    """Symlink the given files into batches, optionally split by pool.
+
+    Command line arguments:
+    --batchsize     maximum number of files per batch (optional)
+    --real-names    names that the pool ids are matched against
+    --galaxy-files  paths to link to, in the same order as --real-names
+    --pool-ids      regex patterns that split the files into pools
+                    (optional)
+
+    Each file gets a symlink in the current directory named
+    <pool>___batch<batch number>_inputfn<number within batch>.mzid
+    Batch numbers keep counting across pools.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--batchsize', dest='batchsize', type=int,
+                        default=False)
+    parser.add_argument('--real-names', dest='realnames', nargs='+',
+                        required=True)
+    parser.add_argument('--galaxy-files', dest='galaxyfiles', nargs='+',
+                        required=True)
+    parser.add_argument('--pool-ids', dest='poolids', nargs='+', default=False)
+    args = parser.parse_args()
+    # Names and paths are paired by position. Without this check a
+    # mismatch only shows up as an IndexError after some links have
+    # already been created.
+    if len(args.realnames) != len(args.galaxyfiles):
+        parser.error('--real-names and --galaxy-files must have the same '
+                     'number of entries')
+    batches = get_batches_of_galaxyfiles(args.realnames, args.batchsize,
+                                         args.poolids)
+    for batchcount, (pool_id, batch) in enumerate(batches):
+        for fncount, index in enumerate(batch):
+            dsetname = '{}___batch{}_inputfn{}.mzid'.format(
+                pool_id, batchcount, fncount)
+            print('producing', dsetname)
+            os.symlink(args.galaxyfiles[index], dsetname)
+
+# Run the command line interface only when executed as a script.
+if __name__ == '__main__':
+    main()