view change_fasta_header_using_tabular_file.py @ 0:540425dc9746 draft default tip

planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author portiahollyoak
date Fri, 22 Apr 2016 12:07:53 -0400
parents
children
line wrap: on
line source


# coding: utf-8
import argparse
import sys


def get_dict(tabular_file_handle):
    """
    Build a lookup dictionary from a two-column, tab-separated file.

    Every non-blank line must hold exactly two fields separated by one tab:
    the TE ID name (used as the key) and its common usage name (used as the
    value). Leading and trailing white space is stripped from each line
    before it is split. Blank lines are skipped. If the same key occurs more
    than once, the last occurrence wins.

    tabular_file_handle: any iterable of text lines, e.g. an open file.

    Returns a dict mapping the first column to the second column.

    Raises ValueError if a non-blank line does not contain exactly two
    tab-separated fields.
    """
    dictionary = {}
    for line_number, line in enumerate(tabular_file_handle, 1):
        line = line.strip()
        if not line:
            # A blank line (typically the empty last line of the file)
            # carries no mapping, so it is ignored instead of failing the
            # two-field unpacking below.
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            # Still a ValueError, as before, but one that tells the user
            # which line of the input is malformed.
            raise ValueError(
                "Line %d of the tabular file must contain exactly two "
                "tab-separated columns, found %d: %r"
                % (line_number, len(fields), line)
            )
        key, value = fields
        dictionary[key] = value
    return dictionary


def replace_id(line, dictionary):
    """
    Return a fasta header line with its sequence name replaced.

    The name is everything after the first character of the line (the ">"
    marker), with surrounding white space removed. If that name is a key of
    the dictionary, it is swapped for the value assigned to it; the marker
    and the white space around the name (including the line ending) are
    kept as they were. If the name is not a key of the dictionary, a notice
    is printed and the line is returned unchanged.

    line: a fasta header line such as ">fasta_1\n".
    dictionary: mapping from current sequence names to replacement names.

    Returns the (possibly rewritten) header line.
    """
    marker, remainder = line[:1], line[1:]
    key = remainder.strip()  # The ID name, not including the ">"
    if key in dictionary:
        # Substitute only inside the text after the marker, and only once,
        # so a name that itself begins with ">" cannot swallow the marker.
        return marker + remainder.replace(key, dictionary[key], 1)
    # The name came from the fasta file; what is missing is its entry in the
    # tabular mapping.
    print("Header %s is not present in the tabular file, left unchanged" % key)
    return line


description = (
    "This script will exchange fasta headers in multifasta file "
    "with values linked in tabular file"
)

# The first positional parameter of ArgumentParser is ``prog`` (the program
# name shown in the usage line), so the description has to be passed by
# keyword to appear as the help text.
parser = argparse.ArgumentParser(description=description)
# Both inputs are mandatory: without them open() below would be called with
# None. Marking them required makes argparse report the missing option.
parser.add_argument(
    "--tab_input",
    required=True,
    help="A tabular file containing two linked columns separated by a tab",
)
parser.add_argument(
    "--fasta_input",
    required=True,
    help="A multifasta file containing fasta headers and their sequences",
)
parser.add_argument("output", help="Name of the output fasta file.")
# When testing interactively, pass an explicit argument list instead, e.g.:
#   args = parser.parse_args(["--tab_input", "TE_ID_Names.tsv",
#                             "--fasta_input", "TE_seq_d.fasta",
#                             "my_fancy_new_out.fasta"])
args = parser.parse_args()


# Python 3's open() accepts an explicit text encoding; Python 2's built-in
# open() does not know that keyword, so it is only supplied on Python 3.
python_version = sys.version_info
if python_version.major >= 3:
    kwargs = {"encoding": "utf-8"}
else:
    kwargs = {}

with open(args.tab_input, **kwargs) as tabular_file_handle:
    dictionary = get_dict(tabular_file_handle)

with open(args.fasta_input, **kwargs) as fasta_file_handle:
    # The output is written with the same encoding the inputs were read
    # with, so headers containing non-ASCII characters do not depend on the
    # locale's default encoding.
    with open(args.output, "w", **kwargs) as output:
        for line in fasta_file_handle:
            # Only header lines are rewritten; sequence lines pass through.
            if line.startswith(">"):
                line = replace_id(line, dictionary)
            output.write(line)