view filter_by_fasta_ids.py @ 4:fa59d6fea7f5 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/t_coffee commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author earlhaminst
date Fri, 03 Mar 2017 07:29:32 -0500
parents 78dd29aa7fc1
children 0a189243186d
line wrap: on
line source

#!/usr/bin/env python
""" A script to build specific fasta databases """
from __future__ import print_function

import collections
import sys

Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while True:
            if not line:
                return
            assert line.startswith('>'), "FASTA headers must start with >"
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            sequence = "".join(sequence_parts)
            yield Sequence(header, sequence)


def target_match(target, search_entry):
    ''' Matches '''
    search_entry = search_entry.upper()
    for atarget in target:
        if search_entry.find(atarget) > -1:
            return atarget
    return None


def main():
    ''' the main function'''

    used_sequences = set()
    work_summary = {'wanted': 0, 'found': 0, 'duplicates': 0}
    targets = []

    with open(sys.argv[1]) as f_target:
        for line in f_target.readlines():
            targets.append(">%s" % line.strip().upper())

    work_summary['wanted'] = len(targets)

    for entry in FASTAReader_gen(sys.argv[2]):
        target_matched_results = target_match(targets, entry.header)
        if target_matched_results:
            work_summary['found'] += 1
            targets.remove(target_matched_results)
            sequence = entry.sequence
            used_sequences.add(sequence)
            print(entry.header)
            print(sequence)


if __name__ == "__main__":
    main()