Mercurial > repos > p.lucas > get_major_minor
changeset 1:54d44429c585 draft
Uploaded script
author | p.lucas |
---|---|
date | Tue, 11 Jun 2024 10:05:47 +0000 |
parents | eb1a6c778688 |
children | 908024e80b3d |
files | VCF_create_major_and_minor_reference.py |
diffstat | 1 files changed, 136 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VCF_create_major_and_minor_reference.py Tue Jun 11 10:05:47 2024 +0000 @@ -0,0 +1,136 @@ +#!/usr/python3 +# IMPORT +import os +import argparse +import sys +import re +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +from Bio.Data import IUPACData + +""" +Get vcf file to create dict with major and minor mutation +Get fasta file with reference sequence used to get the vcf file +Create 2 reference files, one with all major mutation +and one with minor mutation. +""" + + +# FUNCTION TO READ VCF FILE +def CatchVCFMut(name_seq_ref, vcf_file): + final_dict = {} + # all iupac dict + iupac = {v: k for k, v in IUPACData.ambiguous_dna_values.items()} + # Store vcf info et get major and minor base for each SNP + for line in open(vcf_file): + li = line.strip() + if not li.startswith("#"): + # print(li) + name_seq = li.split("\t")[0] + if name_seq == name_seq_ref: + pos = str(li.split("\t")[1]) + ref = li.split("\t")[3] + alt = li.split("\t")[4] + if not ref or not alt: + continue + if bool(re.search("[A-Z]", ref)) is False or bool(re.search("[A-Z]", alt)) is False: + print(li) + continue + depth = int(re.findall("DP=[0-9]+;", li)[0].split("=")[1].split(";")[0]) + freq_alt = round(float(re.findall("AF=[0-9.]+;", li)[0].split("=")[1].split(";")[0])*100, 2) + counting_list = re.findall("DP4=.*$", li)[0].split("=")[1].split(",") + freq_ref = round((int(counting_list[0]) + int(counting_list[1]))*100/depth, 2) + # Cas où la position est déjà enregistree + if pos in final_dict.keys(): + old_majo = final_dict[pos][0] + old_mino = final_dict[pos][1] + freq_majo = final_dict[pos][2] + freq_mino = final_dict[pos][3] + # Cas où la nouvelle fréquence alt est > à la freq majo déjà enregistrée + if freq_alt >= freq_majo: + iupac_to_test = "".join(sorted(old_majo + old_mino)) + if iupac_to_test not in iupac.keys(): + convert_iupac = list(iupac.keys())[list(iupac.values()).index(old_mino)] + iupac_to_test = "".join(sorted(old_majo + convert_iupac)) + final_dict[pos] = [alt, iupac[iupac_to_test], freq_alt, freq_majo + freq_mino, name_seq] + else: + iupac_to_test = "".join(sorted(alt + old_mino)) + if iupac_to_test not in iupac.keys(): + convert_iupac = list(iupac.keys())[list(iupac.values()).index(old_mino)] + iupac_to_test = "".join(sorted(alt + convert_iupac)) + freq_add = freq_alt + freq_mino + final_dict[pos] = [old_majo, iupac[iupac_to_test], freq_majo, freq_add, name_seq] + # Nouvelle position + else: + # definition colonne: + # final_dict [ Base majoritaire , Base minoritaire, Frequence B majo, Freq B mino] + if freq_ref >= freq_alt: + final_dict[pos] = [ref, alt, freq_ref, freq_alt, name_seq] + else: + final_dict[pos] = [alt, ref, freq_alt, freq_ref, name_seq] + return final_dict +# + + +# INPUT +def __main__(): + # define arguments + parser = argparse.ArgumentParser(description="""Create two reference file (at the same place of + the reference file), one with major and one with minor mutation, + from lofreq vcf file and original reference fasta file""", + epilog="""This script needs few options use -h to see it.""") + parser.add_argument("-v", "-vcf_file", dest="vcf_file", + help="Vcf file, generate by lofreq.") + parser.add_argument("-r", "-ref_file", dest="ref_file", + help="Reference fasta file used to create vcf file.") + parser.add_argument("-M", "-MAJOR_ref", dest="MAJOR_file", + help="Output MAJOR ref file with .fa or .fasta ext.") + parser.add_argument("-m", "-MINOR_ref", dest="MINOR_file", + help="Output MINOR ref file with .fa or .fasta ext.") + if len(sys.argv) < 9 or len(sys.argv) > 9: + parser.print_help() + sys.exit(1) + options = parser.parse_args() + vcf_file = options.vcf_file + ref_file = options.ref_file + major_file = options.MAJOR_file + minor_file = options.MINOR_file + + # OUTPUT + MAJORoutputfile = open(major_file, "w") + MINORoutputfile = open(minor_file, "w") + + # CORPUS + minor_ref = [] + major_ref = [] + + # Create new references + with open(ref_file, "r") as handle: + for record in SeqIO.parse(handle, "fasta"): + ide = record.id + major_seq = record.seq.tomutable() + minor_seq = record.seq.tomutable() + mut_dict = {} + mut_dict = CatchVCFMut(ide, vcf_file) + # print(mut_dict) + for mut in mut_dict: + # print(mut_dict[mut]) + if bool(re.search('[A-Z]', mut_dict[mut][0])) is False: + major_seq[int(mut)-1] = 'N' + else: + major_seq[int(mut)-1] = mut_dict[mut][0] + if bool(re.search('[A-Z]', mut_dict[mut][1])) is False: + minor_seq[int(mut)-1] = 'N' + else: + minor_seq[int(mut)-1] = mut_dict[mut][1] + major_ref.append(SeqRecord(major_seq, id=f"{ide}_MAJOR", description="")) + minor_ref.append(SeqRecord(minor_seq, id=f"{ide}_MINOR", description="")) + + SeqIO.write(minor_ref, MINORoutputfile, "fasta") + SeqIO.write(major_ref, MAJORoutputfile, "fasta") + + +if __name__ == "__main__": + __main__() +