lumpy_smoove: vcf2hrdetect.py comparison

comparison vcf2hrdetect.py @ 11:5a326a6fa105 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/lumpy_smoove commit 8b10e8fc832f8ca7c32479e20d5edbd62088a3aa

author	artbio
date	Fri, 17 Oct 2025 17:21:17 +0000
parents	7dcf61950215
children

comparison

equal deleted inserted replaced

-:8711df965d4b
+:5a326a6fa105
+#!/usr/bin/env python
+import argparse
+import re
 import sys
-handle = open(sys.argv[1], 'r')
-vcfdict = dict()
+def create_arg_parser():
-tabdict = dict()
+"""Creates and returns the argument parser."""
-for line in handle:
+parser = argparse.ArgumentParser(
-if line[0] == "#":
+description=(
-continue
+"Convert a VCF file from lumpy-smoove to a tabular format "
-else:
+"compatible with the HRDetect pipeline."
-tabline = line[:-1].split("\t")
+)
-vcfdict[tabline[2]] = tabline
+)
-for id in vcfdict.keys():
+parser.add_argument(
-if "_1" in id:
+'vcf_file',
-newid = id[:-2]
+help='Path to the input VCF file.'
-pointbreak = vcfdict[id][4]
+)
-if "]" in pointbreak:
+parser.add_argument(
-coordbreak = pointbreak.split("]")[1].split(":")[1]
+'output_file',
-chrom = pointbreak.split("]")[1].split(":")[0]
+help='Path to the output tabular file.'
-elif "[" in pointbreak:
+)
-coordbreak = pointbreak.split("[")[1].split(":")[1]
+return parser
-chrom = pointbreak.split("[")[1].split(":")[0]
-if vcfdict[id][0] == chrom:
-tabdict[newid] = [chrom, vcfdict[id][1], chrom, coordbreak, "INV"]
+def parse_breakend_alt(alt_field):
-else:
+"""
-tabdict[newid] = [vcfdict[id][0], vcfdict[id][1],
+Parses the ALT field for a breakend and returns chromosome and position.
-chrom, coordbreak, "TRA"]
-for id in list(vcfdict):
+Args:
-if "_" in id:
+alt_field (str): The ALT field (column 5) of a VCF line.
-del vcfdict[id]
-for id in vcfdict.keys():  # only sv that are not of type TRA or INV
+Returns:
-chr1 = vcfdict[id][0]
+tuple: A tuple containing (chromosome, position) or (None, None)
-chr2 = vcfdict[id][0]
+if parsing fails.
-pos1 = vcfdict[id][1]
+"""
-pos2 = vcfdict[id][7].split("END=")[1].split(";")[0]
+# Search for patterns ]chr:pos] or [chr:pos[
-type = vcfdict[id][7].split("SVTYPE=")[1].split(";")[0]
+pattern = (
-tabdict[id] = [chr1, pos1, chr2, pos2, type]
+r"\](?P<chrom1>[^:]+):(?P<pos1>\d+)\]|"
-out = open(sys.argv[2], 'w')
+r"\[(?P<chrom2>[^:]+):(?P<pos2>\d+)\["
-out.write("chr1\tpos1\tchr2\tpos2\ttype\n")
+)
-for key in tabdict:
+match = re.search(pattern, alt_field)
-line = "\t".join(tabdict[key]) + "\n"
-out.write(line)
+if not match:
+return None, None
+groups = match.groupdict()
+chrom = groups['chrom1'] or groups['chrom2']
+pos = groups['pos1'] or groups['pos2']
+return chrom, pos
def process_vcf(vcf_path, output_path):
    """
    Convert a VCF file into a five-column tabular file.

    The output starts with the header ``chr1 pos1 chr2 pos2 type`` and then
    holds at most one tab-separated row per input record:

    * ``SVTYPE=BND`` records take the partner chromosome and position from
      the ALT column (via ``parse_breakend_alt``) and are labelled ``INV``
      when both chromosomes are equal, ``TRA`` otherwise.
    * Records of any other type keep their SVTYPE as the label, use the
      record's own chromosome for ``chr2`` and the INFO ``END`` value for
      ``pos2``.

    Header lines (starting with ``#``), records with fewer than eight
    columns, records without an ``SVTYPE`` tag, breakends whose ALT cannot
    be parsed and non-breakend records without an ``END`` tag are skipped.

    Args:
        vcf_path (str): Path to the input VCF file.
        output_path (str): Path to the output tabular file.

    On a missing file or any other I/O error, a message is printed to
    stderr and the process exits with status 1.
    """
    header = ["chr1", "pos1", "chr2", "pos2", "type"]
    # INFO is a ';'-separated list of tags. Anchoring each tag name on the
    # start of the field or on a preceding ';' prevents "END=" from matching
    # the tail of another tag such as "CIEND=".
    svtype_re = re.compile(r'(?:^|;)SVTYPE=([^;]+)')
    end_re = re.compile(r'(?:^|;)END=([^;]+)')
    try:
        with open(vcf_path, 'r') as infile, open(output_path, 'w') as outfile:
            outfile.write("\t".join(header) + "\n")
            for line in infile:
                if line.startswith('#'):
                    continue
                fields = line.strip().split('\t')
                if len(fields) < 8:
                    continue
                chrom1 = fields[0]
                pos1 = fields[1]
                info = fields[7]
                svtype_match = svtype_re.search(info)
                if not svtype_match:
                    continue  # Skip lines without SVTYPE tag
                svtype = svtype_match.group(1)
                if svtype == "BND":  # Breakend (INV or TRA)
                    # NOTE(review): every BND record that parses yields a
                    # row, so a breakend described by two mate records is
                    # written twice - confirm this is what the consumer
                    # of the table expects.
                    chrom2, pos2 = parse_breakend_alt(fields[4])
                    if not (chrom2 and pos2):
                        continue
                    event_type = "INV" if chrom1 == chrom2 else "TRA"
                    row = [chrom1, pos1, chrom2, pos2, event_type]
                else:  # Other SV types (DEL, DUP, etc.)
                    end_match = end_re.search(info)
                    if not end_match:
                        continue
                    row = [chrom1, pos1, chrom1, end_match.group(1), svtype]
                outfile.write("\t".join(row) + "\n")
    except FileNotFoundError as e:
        # The missing path may be the output location (e.g. a directory
        # that does not exist), so report the path the OS complained about.
        missing = e.filename if e.filename is not None else vcf_path
        print(f"Error: File '{missing}' not found.",
              file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"IO Error: {e}", file=sys.stderr)
        sys.exit(1)
def main():
    """Parse the command-line arguments and run the VCF conversion."""
    cli_args = create_arg_parser().parse_args()
    process_vcf(cli_args.vcf_file, cli_args.output_file)


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Mercurial > repos > artbio > lumpy_smoove

comparison vcf2hrdetect.py @ 11:5a326a6fa105 draft default tip