annotate tn93_filter.py @ 2:b38f620a3628 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author iuc
date Wed, 20 Apr 2022 16:59:49 +0000
parents 9d793e88e15f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
1 import argparse
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
2 import csv
2
b38f620a3628 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents: 1
diff changeset
3 import random
1
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
4
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
5 from Bio import SeqIO
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
6
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
# Command-line interface: every option is a mandatory string (a file path).
arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well')

for short_flag, long_flag, help_text in (
    ('-f', '--reference', 'Reference sequence'),
    ('-d', '--distances', 'Calculated pairwise distances'),
    ('-r', '--reads', 'Output file for filtered reads'),
    ('-q', '--clusters', 'Compressed background clusters'),
):
    arguments.add_argument(short_flag, long_flag, help=help_text, required=True, type=str)

settings = arguments.parse_args()

# Fallback values, kept as-is when the reference file yields no record.
reference_seq = ''
reference_name = 'REFERENCE'
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
17
2
b38f620a3628 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents: 1
diff changeset
18
b38f620a3628 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents: 1
diff changeset
def unique_id(new_id, existing_ids):
    """Return an identifier that is not a member of ``existing_ids``.

    ``new_id`` is returned unchanged when it is not already present.
    Otherwise an underscore followed by ten random hexadecimal characters
    is appended to ``new_id``, and a fresh suffix is drawn until the
    result is unused.

    Every retry starts again from the original ``new_id``, so a renamed
    identifier is always exactly ``len(new_id) + 11`` characters long
    instead of growing by one more suffix per collision.

    :param new_id: the preferred identifier (a string).
    :param existing_ids: any container supporting ``in``; it is not
        modified, so the caller must record the returned id itself.
    :return: ``new_id`` or a suffixed variant absent from ``existing_ids``.
    """
    candidate = new_id
    while candidate in existing_ids:
        suffix = ''.join(random.choices('0123456789abcdef', k=10))
        candidate = '%s_%s' % (new_id, suffix)
    return candidate
b38f620a3628 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents: 1
diff changeset
23
b38f620a3628 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents: 1
diff changeset
24
1
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
# Only the first record of the reference FASTA is used; when the file holds
# no record, the previously assigned defaults stay in place.
with open(settings.reference) as reference_handle:
    first_record = next(SeqIO.parse(reference_handle, 'fasta'), None)
    if first_record is not None:
        reference_name = first_record.name.split(' ')[0]
        reference_seq = first_record.seq
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
30
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
# Collect the identifiers found in the second column of the distances CSV;
# cluster sequences carrying one of these names are dropped later on.
with open(settings.distances) as fh:
    reader = csv.reader(fh, delimiter=',')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    # Adding to a set is already idempotent, so repeated identifiers need no
    # special handling. Rows with fewer than two columns (e.g. blank lines)
    # carry no identifier and are skipped instead of raising IndexError.
    seqs_to_filter = {row[1] for row in reader if len(row) > 1}

# The reference sequence itself must never be filtered out.
seqs_to_filter.discard(reference_name)
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
42
9d793e88e15f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff changeset
# Append every cluster sequence that survives the filter to the reads file,
# then append the reference sequence. The file is opened in append mode and
# each record is written with a leading newline.
# NOTE(review): presumably the reads file already holds the query reads at
# this point - confirm against the tool wrapper.
with open(settings.reads, "a+") as fh:
    # Names written so far; a set keeps the uniqueness check O(1) per record.
    seqs_filtered = set()
    for seq_record in SeqIO.parse(settings.clusters, "fasta"):
        # The reference is emitted separately below, never from the clusters.
        if seq_record.name.split(' ')[0] == reference_name:
            continue
        if seq_record.name in seqs_to_filter:
            continue
        # Rename on clashes so the output never holds two identical headers.
        unique_name = unique_id(seq_record.name, seqs_filtered)
        fh.write('\n>%s\n%s' % (unique_name, seq_record.seq))
        seqs_filtered.add(unique_name)
    # The reference is always written under the literal name REFERENCE.
    if reference_name not in seqs_filtered:
        fh.write('\n>REFERENCE\n%s' % reference_seq)