# view filter_by_fasta_ids.py @ 7:0a189243186d draft
#
# planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 88e1592fa528db1941bce52c588d4347a69cc745
# author earlhaminst
# date Wed, 18 Sep 2019 06:37:40 -0400
# parents fa59d6fea7f5
# children
# line wrap: on
# line source

#!/usr/bin/env python
"""Filter a FASTA file, printing the records whose headers match a list of IDs."""
from __future__ import print_function

import collections
import sys

# One FASTA record: ``header`` is the full header line (leading '>' kept,
# trailing whitespace removed); ``sequence`` is the record's sequence lines
# joined into a single string.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Lazily yield one ``Sequence`` per record of a FASTA file.

    Args:
        fasta_filename: Path of the FASTA file to read.

    Yields:
        ``Sequence(header, sequence)`` tuples in file order.  The header
        keeps its leading ``>`` and has trailing whitespace stripped; each
        sequence line is right-stripped and the pieces are concatenated.
        A header with no following sequence lines yields an empty sequence.
        An empty file yields nothing.

    Raises:
        AssertionError: If the file's first line does not start with ``>``.
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header closes the record collected so far, if any.
                if header is not None:
                    yield Sequence(header, "".join(sequence_parts))
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Raised explicitly rather than via ``assert`` so the check
                # still runs under ``python -O``; the exception type and
                # message are the same as before.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Emit the final record, which has no following header to close it.
        if header is not None:
            yield Sequence(header, "".join(sequence_parts))


def target_match(target, search_entry):
    """Return the first item of ``target`` contained in ``search_entry``.

    ``search_entry`` is upper-cased before the comparison; the items of
    ``target`` are compared as given.  Matching is by substring, and items
    are tried in iteration order.  Returns ``None`` when nothing matches.
    """
    haystack = search_entry.upper()
    hits = (candidate for candidate in target if candidate in haystack)
    return next(hits, None)


def main():
    """Print the FASTA records whose header matches a wanted identifier.

    Reads identifiers, one per line, from the file named by ``sys.argv[1]``
    and FASTA records from the file named by ``sys.argv[2]``.  Each
    identifier is stripped, upper-cased and prefixed with ``>``; a record is
    selected when its upper-cased header contains one of them.  Selected
    records are written to standard output as the header line followed by
    the sequence on a single line.

    Every identifier line selects at most one record: it is dropped from the
    wanted list once it has matched.
    """
    with open(sys.argv[1]) as f_target:
        # Blank lines are skipped: they would otherwise turn into the bare
        # target ">", which is a substring of every FASTA header and would
        # select an unrelated record.
        targets = [
            ">%s" % line.strip().upper()
            for line in f_target
            if line.strip()
        ]

    for entry in FASTAReader_gen(sys.argv[2]):
        matched = target_match(targets, entry.header)
        if matched:
            # Consume the identifier so it cannot select a second record.
            targets.remove(matched)
            print(entry.header)
            print(entry.sequence)
            if not targets:
                # Nothing left to look for; no later record can match.
                break


# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()