Mercurial > repos > p.lucas > get_major_minor

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VCF_create_major_and_minor_reference.py	Tue Jun 11 10:05:47 2024 +0000
@@ -0,0 +1,136 @@
+#!/usr/python3
+# IMPORT
+import os
+import argparse
+import sys
+import re
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from Bio.Data import IUPACData
+
+"""
+Get vcf file to create dict with major and minor mutation
+Get fasta file with reference sequence used to get the vcf file
+Create 2 reference files, one with all major mutation
+and one with minor mutation.
+"""
+
+
+# FUNCTION TO READ VCF FILE
+def CatchVCFMut(name_seq_ref, vcf_file):
+  final_dict = {}
+  # all iupac dict
+  iupac = {v: k for k, v in IUPACData.ambiguous_dna_values.items()}
+  # Store vcf info et get major and minor base for each SNP
+  for line in open(vcf_file):
+    li = line.strip()
+    if not li.startswith("#"):
+      # print(li)
+      name_seq = li.split("\t")[0]
+      if name_seq == name_seq_ref:
+        pos = str(li.split("\t")[1])
+        ref = li.split("\t")[3]
+        alt = li.split("\t")[4]
+        if not ref or not alt:
+          continue
+        if bool(re.search("[A-Z]", ref)) is False or bool(re.search("[A-Z]", alt)) is False:
+          print(li)
+          continue
+        depth = int(re.findall("DP=[0-9]+;", li)[0].split("=")[1].split(";")[0])
+        freq_alt = round(float(re.findall("AF=[0-9.]+;", li)[0].split("=")[1].split(";")[0])*100, 2)
+        counting_list = re.findall("DP4=.*$", li)[0].split("=")[1].split(",")
+        freq_ref = round((int(counting_list[0]) + int(counting_list[1]))*100/depth, 2)
+        # Cas où la position est déjà enregistree
+        if pos in final_dict.keys():
+          old_majo = final_dict[pos][0]
+          old_mino = final_dict[pos][1]
+          freq_majo = final_dict[pos][2]
+          freq_mino = final_dict[pos][3]
+          # Cas où la nouvelle fréquence alt est > à la freq majo déjà enregistrée
+          if freq_alt >= freq_majo:
+            iupac_to_test = "".join(sorted(old_majo + old_mino))
+            if iupac_to_test not in iupac.keys():
+              convert_iupac = list(iupac.keys())[list(iupac.values()).index(old_mino)]
+              iupac_to_test = "".join(sorted(old_majo + convert_iupac))
+            final_dict[pos] = [alt, iupac[iupac_to_test], freq_alt, freq_majo + freq_mino, name_seq]
+          else:
+            iupac_to_test = "".join(sorted(alt + old_mino))
+            if iupac_to_test not in iupac.keys():
+              convert_iupac = list(iupac.keys())[list(iupac.values()).index(old_mino)]
+              iupac_to_test = "".join(sorted(alt + convert_iupac))
+            freq_add = freq_alt + freq_mino
+            final_dict[pos] = [old_majo, iupac[iupac_to_test], freq_majo, freq_add, name_seq]
+        # Nouvelle position
+        else:
+          # definition colonne:
+          # final_dict [ Base majoritaire , Base minoritaire, Frequence B majo, Freq B mino]
+          if freq_ref >= freq_alt:
+            final_dict[pos] = [ref, alt, freq_ref, freq_alt, name_seq]
+          else:
+            final_dict[pos] = [alt, ref, freq_alt, freq_ref, name_seq]
+  return final_dict
+#
+
+
+# INPUT
+def __main__():
+  # define arguments
+  parser = argparse.ArgumentParser(description="""Create two reference file (at the same place of
+                                   the reference file), one with major and one with minor mutation,
+                                   from lofreq vcf file and original reference fasta file""",
+                                   epilog="""This script needs few options use -h to see it.""")
+  parser.add_argument("-v", "-vcf_file", dest="vcf_file",
+                      help="Vcf file, generate by lofreq.")
+  parser.add_argument("-r", "-ref_file", dest="ref_file",
+                      help="Reference fasta file used to create vcf file.")
+  parser.add_argument("-M", "-MAJOR_ref", dest="MAJOR_file",
+                      help="Output MAJOR ref file with .fa or .fasta ext.")
+  parser.add_argument("-m", "-MINOR_ref", dest="MINOR_file",
+                      help="Output MINOR ref file with .fa or .fasta ext.")
+  if len(sys.argv) < 9 or len(sys.argv) > 9:
+    parser.print_help()
+    sys.exit(1)
+  options = parser.parse_args()
+  vcf_file = options.vcf_file
+  ref_file = options.ref_file
+  major_file = options.MAJOR_file
+  minor_file = options.MINOR_file
+
+  # OUTPUT
+  MAJORoutputfile = open(major_file, "w")
+  MINORoutputfile = open(minor_file, "w")
+
+  # CORPUS
+  minor_ref = []
+  major_ref = []
+
+  # Create new references
+  with open(ref_file, "r") as handle:
+    for record in SeqIO.parse(handle, "fasta"):
+      ide = record.id
+      major_seq = record.seq.tomutable()
+      minor_seq = record.seq.tomutable()
+      mut_dict = {}
+      mut_dict = CatchVCFMut(ide, vcf_file)
+      # print(mut_dict)
+      for mut in mut_dict:
+        # print(mut_dict[mut])
+        if bool(re.search('[A-Z]', mut_dict[mut][0])) is False:
+          major_seq[int(mut)-1] = 'N'
+        else:
+          major_seq[int(mut)-1] = mut_dict[mut][0]
+        if bool(re.search('[A-Z]', mut_dict[mut][1])) is False:
+          minor_seq[int(mut)-1] = 'N'
+        else:
+          minor_seq[int(mut)-1] = mut_dict[mut][1]
+      major_ref.append(SeqRecord(major_seq, id=f"{ide}_MAJOR", description=""))
+      minor_ref.append(SeqRecord(minor_seq, id=f"{ide}_MINOR", description=""))
+
+  SeqIO.write(minor_ref, MINORoutputfile, "fasta")
+  SeqIO.write(major_ref, MAJORoutputfile, "fasta")
+
+
+if __name__ == "__main__":
+  __main__()
+