annotate filter_by_fasta_ids.py @ 4:fa59d6fea7f5 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author earlhaminst
date Fri, 03 Mar 2017 07:29:32 -0500
parents 78dd29aa7fc1
children 0a189243186d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
1 #!/usr/bin/env python
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
2 """ A script to build specific fasta databases """
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
3 from __future__ import print_function
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
4
3
78dd29aa7fc1 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 1
diff changeset
5 import collections
0
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
6 import sys
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
7
3
78dd29aa7fc1 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents: 1
diff changeset
# Lightweight record for one FASTA entry: the raw header line (including the
# leading '>') and the sequence with all line breaks removed.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Lazily yield the records of a FASTA file as ``Sequence`` tuples.

    The file is streamed line by line, so only one record is held in memory
    at a time.

    :param fasta_filename: path of the FASTA file to read.
    :returns: a generator of ``Sequence(header, sequence)`` where ``header``
        is the header line with trailing whitespace stripped (the leading
        '>' is kept) and ``sequence`` is the concatenation of the record's
        sequence lines, each with trailing whitespace stripped.
    :raises AssertionError: if the first line of a non-empty file does not
        start with '>'.
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header terminates the record collected so far.
                if header is not None:
                    yield Sequence(header, "".join(sequence_parts))
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Raised explicitly rather than via ``assert`` so the check
                # is still performed when Python runs with -O; the exception
                # type is kept for backward compatibility.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Flush the final record (absent only when the file is empty).
        if header is not None:
            yield Sequence(header, "".join(sequence_parts))
0
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
26
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
27
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
def target_match(target, search_entry):
    """Return the first target that occurs inside ``search_entry``.

    ``search_entry`` is upper-cased before comparison; the targets are used
    as given, so callers that want a case-insensitive match must supply
    upper-case targets.

    :param target: iterable of strings to look for, checked in order.
    :param search_entry: the string (e.g. a FASTA header) to search in.
    :returns: the first element of ``target`` that is a substring of the
        upper-cased ``search_entry``, or ``None`` when nothing matches.
    """
    search_entry = search_entry.upper()
    for atarget in target:
        if atarget in search_entry:
            return atarget
    return None
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
35
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
36
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
def main():
    """Print the FASTA records whose headers match a list of wanted IDs.

    ``sys.argv[1]`` is a text file with one ID per line and ``sys.argv[2]``
    is the FASTA file to filter.  Each ID is stripped, upper-cased and
    prefixed with '>' and then looked up (via ``target_match``) as a
    substring of every upper-cased FASTA header.  A matching record is
    written to standard output as its header line followed by its sequence
    on a single line.  Every ID is consumed by its first match, so it
    selects at most one record.
    """
    with open(sys.argv[1]) as f_target:
        # Blank lines are skipped: they would otherwise become the bare
        # target ">" which is a substring of every FASTA header.
        targets = [
            ">%s" % line.strip().upper()
            for line in f_target
            if line.strip()
        ]

    for entry in FASTAReader_gen(sys.argv[2]):
        matched_target = target_match(targets, entry.header)
        if matched_target is None:
            continue
        # Consume the ID so that it cannot select a second record.
        targets.remove(matched_target)
        print(entry.header)
        print(entry.sequence)
        if not targets:
            # Every wanted ID has been found; nothing further can match,
            # so the rest of the FASTA file does not need to be scanned.
            break
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
59
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
60
794a6e864a96 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff changeset
# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()