view filter_by_fasta_ids.py @ 8:ae69d14b6fbf draft default tip

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 28bbc172f28d9fbe7ed2795043ff61d9e0642d13"
author earlhaminst
date Thu, 14 Jan 2021 12:14:52 +0000
parents 0a189243186d
children
line wrap: on
line source

#!/usr/bin/env python
""" A script to build specific fasta databases """
from __future__ import print_function

import collections
import sys

# One FASTA record: the header line (including the leading '>') and the
# residues of all its sequence lines joined into a single string.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Lazily yield one ``Sequence`` per record of a FASTA file.

    The header keeps its leading ``>`` and has trailing whitespace removed.
    Every following line up to the next header is right-stripped and
    concatenated to form the sequence, so blank lines inside a record
    contribute nothing.  An empty file yields no records.

    :param fasta_filename: path of the FASTA file to read
    :raises AssertionError: if the file does not begin with a ``>`` header
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if header is not None:
                    yield Sequence(header, "".join(sequence_parts))
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Raised explicitly rather than with ``assert`` so the check
                # is not removed when Python runs with -O.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Flush the last record, which has no following header to close it.
        if header is not None:
            yield Sequence(header, "".join(sequence_parts))


def target_match(target, search_entry):
    """Return the first item of ``target`` found inside ``search_entry``.

    ``search_entry`` is upper-cased before the comparison; the items of
    ``target`` are compared as given.  The test is a plain substring search,
    so a target such as ``>SEQ1`` also matches the entry ``>seq10``.

    :param target: iterable of strings to look for, tried in order
    :param search_entry: string to search in (a FASTA header in this script)
    :returns: the first matching item of ``target``, or ``None`` if none match
    """
    haystack = search_entry.upper()
    return next((needle for needle in target if needle in haystack), None)


def main():
    """Print the FASTA records whose header matches a wanted identifier.

    ``sys.argv[1]`` is a text file with one identifier per line and
    ``sys.argv[2]`` is the FASTA file to filter.  Each identifier is
    stripped, upper-cased and prefixed with ``>`` before being looked up in
    the headers with ``target_match``.  A matched identifier is removed from
    the wanted list, so it selects at most one record.  Matching records are
    written to standard output as the header line followed by the sequence
    on a single line.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <ids_file> <fasta_file>" % sys.argv[0])

    with open(sys.argv[1]) as f_target:
        # Blank lines are skipped: they would become the bare target ">",
        # which is contained in every header and would select an unrelated
        # record.
        targets = [
            ">%s" % line.strip().upper() for line in f_target if line.strip()
        ]

    for entry in FASTAReader_gen(sys.argv[2]):
        matched = target_match(targets, entry.header)
        if matched:
            # Consume the identifier so it cannot select a second record.
            targets.remove(matched)
            print(entry.header)
            print(entry.sequence)
        if not targets:
            # Nothing left to look for; no need to read the rest of the file.
            break


# Run the filter only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()