comparison tn93_filter.py @ 1:cf50aeb956f2 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author iuc
date Wed, 20 Apr 2022 16:59:05 +0000
parents ba95715078c9
children
comparison
equal deleted inserted replaced
0:ba95715078c9 1:cf50aeb956f2
1 import argparse 1 import argparse
2 import csv 2 import csv
3 import random
3 4
4 from Bio import SeqIO 5 from Bio import SeqIO
5 6
6 arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well') 7 arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well')
7 8
8 arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str) 9 arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str)
9 arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str) 10 arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str)
10 arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str) 11 arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str)
11 arguments.add_argument('-q', '--clusters', help='Compressed clusters', required=True, type=str) 12 arguments.add_argument('-q', '--clusters', help='Compressed background clusters', required=True, type=str)
12 settings = arguments.parse_args() 13 settings = arguments.parse_args()
13 14
14 reference_name = 'REFERENCE' 15 reference_name = 'REFERENCE'
15 reference_seq = '' 16 reference_seq = ''
16 17
18
19 def unique_id(new_id, existing_ids):
20 while new_id in existing_ids:
21 new_id += '_' + ''.join(random.choices('0123456789abcdef', k=10))
22 return new_id
23
24
17 with open(settings.reference) as seq_fh: 25 with open(settings.reference) as seq_fh:
18 for seq_record in SeqIO.parse(seq_fh, 'fasta'): 26 for seq_record in SeqIO.parse(seq_fh, 'fasta'):
19 reference_name = seq_record.name 27 reference_name = seq_record.name.split(' ')[0]
20 reference_seq = seq_record.seq 28 reference_seq = seq_record.seq
21 break 29 break
22 30
23 with open(settings.distances) as fh: 31 with open(settings.distances) as fh:
24 reader = csv.reader(fh, delimiter=',') 32 reader = csv.reader(fh, delimiter=',')
25 next(reader) 33 next(reader)
26 seqs_to_filter = set() 34 seqs_to_filter = set()
27 for line in reader: 35 for line in reader:
28 if line[1] not in seqs_to_filter: 36 if line[1] not in seqs_to_filter:
29 seqs_to_filter.add(line[1]) 37 seqs_to_filter.add(line[1])
38 else:
39 seqs_to_filter.add(unique_id(line[1], seqs_to_filter))
30 if reference_name in seqs_to_filter: 40 if reference_name in seqs_to_filter:
31 seqs_to_filter.remove(reference_name) 41 seqs_to_filter.remove(reference_name)
32 42
33 with open(settings.reads, "a+") as fh: 43 with open(settings.reads, "a+") as fh:
34 seqs_filtered = list() 44 seqs_filtered = list()
35 for seq_record in SeqIO.parse(settings.clusters, "fasta"): 45 for seq_record in SeqIO.parse(settings.clusters, "fasta"):
46 if seq_record.name.split(' ')[0] == reference_name:
47 continue
36 if seq_record.name not in seqs_to_filter: 48 if seq_record.name not in seqs_to_filter:
37 if seq_record.name == reference_name: 49 unique_name = unique_id(seq_record.name, seqs_filtered)
38 if seq_record.name not in seqs_filtered: 50 fh.write('\n>%s\n%s' % (unique_name, seq_record.seq))
39 seqs_filtered.append(seq_record.name) 51 seqs_filtered.append(unique_name)
40 else:
41 continue
42 if reference_name not in seqs_filtered: 52 if reference_name not in seqs_filtered:
43 fh.write('\n>REFERENCE\n%s' % reference_seq) 53 fh.write('\n>REFERENCE\n%s' % reference_seq)