Mercurial > repos > earlhaminst > t_coffee
annotate filter_by_fasta_ids.py @ 3:78dd29aa7fc1 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
author | earlhaminst |
---|---|
date | Mon, 20 Feb 2017 06:25:50 -0500 |
parents | b3833e5b50d4 |
children | fa59d6fea7f5 |
rev | line source |
---|---|
0
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
1 #!/usr/bin/env python |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
2 """ A script to build specific fasta databases """ |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
3 from __future__ import print_function |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
4 |
3
78dd29aa7fc1
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents:
1
diff
changeset
|
5 import collections |
0
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
6 import sys |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
7 |
3
78dd29aa7fc1
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 81a1e79dda127d1afc16c7e456bbec16093a3c3f-dirty
earlhaminst
parents:
1
diff
changeset
|
# One FASTA record: the raw header line (leading '>' kept) and its
# concatenated sequence.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Lazily yield the records of a FASTA file as ``Sequence`` tuples.

    Args:
        fasta_filename: Path of the FASTA file to read.

    Yields:
        Sequence: ``header`` is the header line with trailing whitespace
        stripped (the leading ``>`` is kept); ``sequence`` is every line up
        to the next header, each right-stripped and joined into one string.

    Raises:
        AssertionError: If a record does not begin with a ``>`` header line.
            Because sequence lines are consumed up to the next ``>``, only
            the first line of the file can trigger this.
    """
    # The context manager guarantees the handle is released once the
    # generator is exhausted, closed or garbage-collected.
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while line:
            # Raised explicitly instead of via `assert` so the validation
            # still runs under `python -O`; the exception type and message
            # are the ones an assert statement would have produced.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            # Collect sequence lines until the next header or end of file.
            while line and not line.startswith('>'):
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, "".join(sequence_parts))
0
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
26 |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
27 |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
def target_match(target, search_entry):
    """Return the first wanted identifier contained in a FASTA header.

    Args:
        target: Iterable of identifier strings to look for. They are
            compared as given, so they must already be upper-case for the
            match to be case-insensitive.
        search_entry: FASTA header line; it is upper-cased before the
            comparison.

    Returns:
        The first element of ``target`` (in iteration order) that occurs as
        a substring of the upper-cased ``search_entry``, or ``None`` when no
        element matches.

    NOTE: this is a substring test, not an exact identifier comparison, so
    a target such as ``>SEQ1`` also matches a header beginning ``>SEQ10``.
    """
    header = search_entry.upper()
    return next((wanted for wanted in target if wanted in header), None)
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
35 |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
36 |
794a6e864a96
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 230ae552ddeb1bfdef3a09becaa5c6d373529a05-dirty
earlhaminst
parents:
diff
changeset
|
def main():
    """Print the FASTA records whose header matches a wanted identifier.

    Command-line arguments:
        sys.argv[1]: Text file with one identifier per line.
        sys.argv[2]: FASTA file to filter.

    Each identifier is stripped, upper-cased and prefixed with ``>`` before
    being matched against the record headers via ``target_match``. Matching
    records are written to standard output as the header line followed by
    the sequence on a single line. An identifier is dropped after its first
    match, so each one selects at most one record.
    """
    targets = []
    with open(sys.argv[1]) as f_target:
        for line in f_target:
            identifier = line.strip()
            # A blank line would turn into the bare target ">", which is a
            # substring of every header and would select an unrelated record.
            if identifier:
                targets.append(">%s" % identifier.upper())

    for entry in FASTAReader_gen(sys.argv[2]):
        matched_target = target_match(targets, entry.header)
        if matched_target:
            # Consume the identifier so it cannot select a second record.
            targets.remove(matched_target)
            print(entry.header)
            print(entry.sequence)
            # Nothing left to look for: the rest of the file cannot match.
            if not targets:
                break


if __name__ == "__main__":
    main()