annotate extract_fasta_bins.py @ 0:1bc8fd1b3ed0 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
author iuc
date Fri, 18 Feb 2022 14:17:48 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
1 #!/usr/bin/env python
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
2
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
3 import argparse
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
4 import gzip
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
5 import os
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
6 import sys
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
7 from collections import defaultdict
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
8 from functools import partial
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
9
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
10 import pandas as pd
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
11 from Bio import SeqIO
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
12
1bc8fd1b3ed0 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff changeset
def parse_args(argv=None):
    """Parse the command-line options of the bin extraction script.

    argv: optional list of argument strings; when None, argparse falls back
    to sys.argv[1:].

    Returns the populated argparse namespace with the attributes
    ``gzipped``, ``input_fasta``, ``input_cluster`` and ``output_path``.
    """
    # The module has no docstring, so ``description=__doc__`` produced an
    # empty help header; spell the description out instead.
    parser = argparse.ArgumentParser(
        description="Write one Fasta file per cluster listed in a tabular cluster file."
    )
    parser.add_argument('--gzipped', action='store_true', dest='gzipped', help='Input files are gzipped')
    # The three paths below are all dereferenced unconditionally later on, so
    # let argparse report a missing one instead of failing with a TypeError.
    parser.add_argument("--input_fasta", action="store", dest="input_fasta", required=True, help="Input Fasta file")
    parser.add_argument("--input_cluster", action="store", dest="input_cluster", required=True, help="Concoct output cluster file")
    parser.add_argument("--output_path", required=True, help="Output directory")
    return parser.parse_args(argv)


def read_sequences(fasta_path, gzipped=False):
    """Load every record of a Fasta file into a dict keyed by record id.

    fasta_path: path of the Fasta file.
    gzipped: when true the file is opened through gzip in text mode.

    If an id occurs more than once, the last record wins.
    """
    opener = partial(gzip.open, mode='rt') if gzipped else open
    with opener(fasta_path) as handle:
        return {record.id: record for record in SeqIO.parse(handle, "fasta")}


def read_cluster_table(cluster_path):
    """Read the tab-separated cluster file and check its header.

    cluster_path: path of the tabular file whose first two columns must be
    named ``contig_id`` and ``cluster_id``.

    Returns the table as a DataFrame. Writes an error to stderr and exits
    with status -1 when the header does not match.
    """
    # Make sure we're reading the file as tabular! Identifiers are labels, so
    # read them as text: with dtype inference, ids such as "001" or "42"
    # become integers and no longer match the string ids of the Fasta records.
    table = pd.read_csv(cluster_path, sep='\t', dtype=str)
    # Explicit comparison instead of ``assert`` (asserts vanish under -O);
    # slicing also copes with a file that has fewer than two columns.
    if list(table.columns[:2]) != ['contig_id', 'cluster_id']:
        sys.stderr.write("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!\n")
        sys.exit(-1)
    return table


def group_contigs_by_cluster(table):
    """Map each cluster id to the list of its contig ids, in file order.

    table: DataFrame with ``contig_id`` and ``cluster_id`` columns.
    """
    cluster_to_contigs = defaultdict(list)
    for contig_id, cluster_id in zip(table['contig_id'], table['cluster_id']):
        cluster_to_contigs[cluster_id].append(contig_id)
    return cluster_to_contigs


def write_bins(cluster_to_contigs, all_seqs, output_path):
    """Write ``<cluster_id>.fa`` into output_path for every cluster.

    cluster_to_contigs: mapping of cluster id to a list of contig ids.
    all_seqs: mapping of contig id to sequence record.
    output_path: destination directory; created when it does not exist.

    Writes an error to stderr and exits with status -1, before any file is
    written, when a listed contig is absent from all_seqs.
    """
    # Check everything up front so a bad cluster file cannot leave a
    # half-written set of bins behind.
    missing = []
    for contig_ids in cluster_to_contigs.values():
        for contig_id in contig_ids:
            if contig_id not in all_seqs and contig_id not in missing:
                missing.append(contig_id)
    if missing:
        shown = ", ".join(str(contig_id) for contig_id in missing[:10])
        sys.stderr.write(
            "ERROR! {0} contig(s) from the cluster file were not found in the input Fasta file "
            "(first ones: {1}). Exiting!\n".format(len(missing), shown)
        )
        sys.exit(-1)

    os.makedirs(output_path, exist_ok=True)
    for cluster_id, contig_ids in cluster_to_contigs.items():
        output_file = os.path.join(output_path, "{0}.fa".format(cluster_id))
        seqs = [all_seqs[contig_id] for contig_id in contig_ids]
        with open(output_file, 'w') as ofh:
            SeqIO.write(seqs, ofh, 'fasta')


def main(argv=None):
    """Run the extraction: parse options, load inputs, write one file per cluster."""
    args = parse_args(argv)
    # Validate the small cluster table first so a bad header is reported
    # before the (potentially large) Fasta file is loaded.
    cluster_to_contigs = group_contigs_by_cluster(read_cluster_table(args.input_cluster))
    all_seqs = read_sequences(args.input_fasta, args.gzipped)
    write_bins(cluster_to_contigs, all_seqs, args.output_path)


if __name__ == "__main__":
    main()