Mercurial > repos > artbio > lumpy_sv
changeset 0:796552c157de draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit d06124e8a097f3f665b4955281f40fe811eaee64
author | artbio |
---|---|
date | Mon, 24 Jul 2017 08:03:17 -0400 |
parents | |
children | 1ed8619a5611 |
files | extractSplitReads_BwaMem.py lumpy.xml pairend_distro.py test-data/output.vcf test-data/output_extended.vcf test-data/output_two.vcf test-data/sr.input.bam |
diffstat | 7 files changed, 775 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extractSplitReads_BwaMem.py Mon Jul 24 08:03:17 2017 -0400 @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +import sys +import getopt +import string +from optparse import OptionParser +import re + +def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap): + if inFile == "stdin": + data = sys.stdin + else: + data = open(inFile, 'r') + for line in data: + split = 0 + if line[0] == '@': + print line.strip() + continue + samList = line.strip().split('\t') + sam = SAM(samList) + if includeDups==0 and (1024 & sam.flag)==1024: + continue + for el in sam.tags: + if "SA:" in el: + if(len(el.split(";")))<=numSplits: + split = 1 + mate = el.split(",") + mateCigar = mate[3] + mateFlag = int(0) + if mate[2]=="-": mateFlag = int(16) + if split: + read1 = sam.flag & 64 + if read1 == 64: tag = "_1" + else: tag="_2" + samList[0] = sam.query + tag + readCigar = sam.cigar + readCigarOps = extractCigarOps(readCigar,sam.flag) + readQueryPos = calcQueryPosFromCigar(readCigarOps) + mateCigarOps = extractCigarOps(mateCigar,mateFlag) + mateQueryPos = calcQueryPosFromCigar(mateCigarOps) + overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos) + nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap + nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap + mno = min(nonOverlap1, nonOverlap2) + if mno >= minNonOverlap: + print "\t".join(samList) + +#-------------------------------------------------------------------------------------------------- +# functions +#-------------------------------------------------------------------------------------------------- + +class SAM (object): + """ + __very__ basic class for SAM input. + """ + def __init__(self, samList = []): + if len(samList) > 0: + self.query = samList[0] + self.flag = int(samList[1]) + self.ref = samList[2] + self.pos = int(samList[3]) + self.mapq = int(samList[4]) + self.cigar = samList[5] + self.matRef = samList[6] + self.matePos = int(samList[7]) + self.iSize = int(samList[8]) + self.seq = samList[9] + self.qual = samList[10] + self.tags = samList[11:]#tags is a list of each tag:vtype:value sets + self.valid = 1 + else: + self.valid = 0 + self.query = 'null' + + def extractTagValue (self, tagID): + for tag in self.tags: + tagParts = tag.split(':', 2); + if (tagParts[0] == tagID): + if (tagParts[1] == 'i'): + return int(tagParts[2]); + elif (tagParts[1] == 'H'): + return int(tagParts[2],16); + return tagParts[2]; + return None; + +#----------------------------------------------- +cigarPattern = '([0-9]+[MIDNSHP])' +cigarSearch = re.compile(cigarPattern) +atomicCigarPattern = '([0-9]+)([MIDNSHP])' +atomicCigarSearch = re.compile(atomicCigarPattern) + +def extractCigarOps(cigar,flag): + if (cigar == "*"): + cigarOps = [] + elif (flag & 0x0010): + cigarOpStrings = cigarSearch.findall(cigar) + cigarOps = [] + for opString in cigarOpStrings: + cigarOpList = atomicCigarSearch.findall(opString) +# print cigarOpList + # "struct" for the op and it's length + cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) + # add to the list of cigarOps + cigarOps.append(cigar) + cigarOps = cigarOps + cigarOps.reverse() + ##do in reverse order because negative strand## + else: + cigarOpStrings = cigarSearch.findall(cigar) + cigarOps = [] + for opString in cigarOpStrings: + cigarOpList = atomicCigarSearch.findall(opString) + # "struct" for the op and it's length + cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) + # add to the list of cigarOps + cigarOps.append(cigar) +# cigarOps = cigarOps + return(cigarOps) + +def calcQueryPosFromCigar(cigarOps): + qsPos = 0 + qePos = 0 + qLen = 0 + # if first op is a H, need to shift start position + # the opPosition counter sees if the for loop is looking at the first index of the cigar object + opPosition = 0 + for cigar in cigarOps: + if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): + qsPos += cigar.length + qePos += cigar.length + qLen += cigar.length + elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): + qLen += cigar.length + elif cigar.op == 'M' or cigar.op == 'I': + qePos += cigar.length + qLen += cigar.length + opPosition += 1 + d = queryPos(qsPos, qePos, qLen); + return d + +class cigarOp (object): + """ + sturct to store a discrete CIGAR operations + """ + def __init__(self, opLength, op): + self.length = int(opLength) + self.op = op + +class queryPos (object): + """ + struct to store the start and end positions of query CIGAR operations + """ + def __init__(self, qsPos, qePos, qLen): + self.qsPos = int(qsPos) + self.qePos = int(qePos) + self.qLen = int(qLen) + + +def calcQueryOverlap(s1,e1,s2,e2): + o = 1 + min(e1, e2) - max(s1, s2) + return max(0, o) + +############################################### + +class Usage(Exception): + def __init__(self, msg): + self.msg = msg + +def main(): + + usage = """%prog -i <file> + +extractSplitReads_BwaMem v0.1.0 +Author: Ira Hall +Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates. +Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. + """ + parser = OptionParser(usage) + + parser.add_option("-i", "--inFile", dest="inFile", + help="A SAM file or standard input (-i stdin).", + metavar="FILE") + parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int", + help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2", + metavar="INT") + parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0, + help="Include alignments marked as duplicates. Default=False") + parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int", + help="minimum non-overlap between split alignments on the query (default=20)", + metavar="INT") + (opts, args) = parser.parse_args() + if opts.inFile is None: + parser.print_help() + print + else: + try: + extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap) + except IOError as err: + sys.stderr.write("IOError " + str(err) + "\n"); + return +if __name__ == "__main__": + sys.exit(main())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lumpy.xml Mon Jul 24 08:03:17 2017 -0400 @@ -0,0 +1,284 @@ +<tool id="lumpy" name="lumpy-sv" version="1.0.0"> + <description>find structural variants</description> + <requirements> + <requirement type="package" version="0.2.13">lumpy-sv</requirement> + <requirement type="package" version="1.3.1">samtools</requirement> + <requirement type="package" version="1.11.2=py27_0">numpy</requirement> + </requirements> + <stdio> + <exit_code range="1:" level="fatal" description="Tool exception" /> + </stdio> + <command detect_errors="exit_code"><![CDATA[ + #import re + #set one_sample_bam = re.sub('[^\w\-]', '_', str($analysis_type.input_file.element_identifier)) + #if $analysis_type.analysis_type_list == "one_sample": + ln -f -s '$analysis_type.input_file' '$one_sample_bam' && + #else: + #set sample_a_bam = re.sub('[^\w\-]', '_', str($analysis_type.input_file.element_identifier)) + #set sample_b_bam = re.sub('[^\w\-]', '_', str($analysis_type.input_fileB.element_identifier)) + #if $sample_a_bam == $sample_b_bam: + #set sample_a_bam = "%s_a" % str($sample_a_bam) + #set sample_b_bam = "%s_b" % str($sample_b_bam) + #end if + ln -f -s '$analysis_type.input_file' '$sample_a_bam' && + ln -f -s '$analysis_type.input_fileB' '$sample_b_bam' && + #end if + + #if $analysis_type.analysis_type_list == "one_sample": + + #if $seq_method.seq_method_list == "paired-end": + samtools view -u -F 1294 '$one_sample_bam' | samtools sort -O bam -o input.discordants.bam && + samtools view -h '$one_sample_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam && + samtools view '$one_sample_bam' + |python $__tool_directory__/pairend_distro.py -r $analysis_type.readLength -X 4 -N $seq_method.additional_params.samplingValue -o input.lib.histo > meandev.txt && + mean=\$(cat meandev.txt | sed s/mean:// | sed -r s/stdev:.+//) && + stdev=\$(cat meandev.txt | sed -r s/mean:.+stdev://) && + lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt + #if $output_format == "BEDPE": + -b + #end if + -pe id:'$one_sample_bam',bam_file:input.discordants.bam,histo_file:input.lib.histo,mean:"\$mean",stdev:"\$stdev",read_length:$analysis_type.readLength,min_non_overlap:$seq_method.additional_params.min_non_overlap,discordant_z:$seq_method.additional_params.discordant_z,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold + -sr id:'$one_sample_bam',bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call' + #elif $seq_method.seq_method_list == "single-read": + samtools view -h '$one_sample_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam && + lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt + #if $output_format == "BEDPE": + -b + #end if + -sr id:'$one_sample_bam',bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call' + #end if + #else: + #if $seq_method.seq_method_list == "paired-end": + samtools view -u -F 1294 '$sample_a_bam' | samtools sort -O bam -o input.discordants.bam && + samtools view -u -F 1294 '$sample_b_bam' | samtools sort -O bam -o input.B.discordants.bam && + samtools view -h '$sample_a_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam && + samtools view -h '$sample_b_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.B.splitters.bam && + samtools view '$sample_a_bam' + |python $__tool_directory__/pairend_distro.py -r $analysis_type.readLength -X 4 -N $seq_method.additional_params.samplingValue -o input.lib.histo > meandevA.txt && + samtools view '$sample_b_bam' + |python $__tool_directory__/pairend_distro.py -r $analysis_type.readLengthB -X 4 -N $seq_method.additional_params.samplingValue -o input.B.lib.histo > meandevB.txt && + meanA=\$(cat meandevA.txt | sed s/mean:// | sed -r s/stdev:.+//) && + meanB=\$(cat meandevB.txt | sed s/mean:// | sed -r s/stdev:.+//) && + stdevA=\$(cat meandevA.txt | sed -r s/mean:.+stdev://) && + stdevB=\$(cat meandevB.txt | sed -r s/mean:.+stdev://) && + lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt + #if $output_format == "BEDPE": + -b + #end if + -pe id:inputA.bam,bam_file:input.discordants.bam,histo_file:input.lib.histo,mean:"\$meanA",stdev:"\$stdevA",read_length:$analysis_type.readLength,min_non_overlap:$seq_method.additional_params.min_non_overlap,discordant_z:$seq_method.additional_params.discordant_z,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold + -pe id:inputB.bam,bam_file:input.B.discordants.bam,histo_file:input.B.lib.histo,mean:"\$meanB",stdev:"\$stdevA",read_length:$analysis_type.readLengthB,min_non_overlap:$seq_method.additional_params.min_non_overlap,discordant_z:$seq_method.additional_params.discordant_z,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold + -sr id:inputA.bam,bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold + -sr id:inputB.bam,bam_file:input.B.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call' + #elif $seq_method.seq_method_list == "single-read": + samtools view -h '$sample_a_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam && + samtools view -h '$sample_b_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.B.splitters.bam && + lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt + #if $output_format == "BEDPE": + -b + #end if + -sr id:'$sample_a_bam',bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold + -sr id:'$sample_b_bam',bam_file:input.B.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call' + #end if + #end if + + ]]></command> + <!-- basic error handling --> + <inputs> + <conditional name="analysis_type"> + <param help="Single or paired conditions (eg tumor vs normal)" label="Input(s)" name="analysis_type_list" type="select"> + <option selected="True" value="one_sample">One Sample</option> + <option value="two_sample">Two samples</option> + </param> + <when value="one_sample"> + <param format="bam" name="input_file" type="data" label="One BAM alignment file produced by BWA-mem"/> + <param name="readLength" value="151" type="integer" label="read length" help="e.g. 151 nt" /> + </when> + <when value="two_sample"> + <param format="bam" name="input_file" type="data" label="One BAM alignment file produced by BWA-mem"/> + <param name="readLength" value="151" type="integer" label="read length" help="e.g. 151 nt" /> + <param format="bam" name="input_fileB" type="data" label="One BAM alignment file produced by BWA-mem"/> + <param name="readLengthB" value="151" type="integer" label="read length" help="e.g. 151 nt" /> + </when> + </conditional> + <conditional name="seq_method"> + <param help="Paired-end or single-read sequencing" label="Sequencing method" name="seq_method_list" type="select"> + <option selected="True" value="paired-end">Paired-end sequencing</option> + <option value="single-read">Single-read sequencing</option> + </param> + <when value="paired-end"> + <section name="additional_params" title="Additional Options" expanded="False"> + <param name="samplingValue" value="100000" type="integer" label="number of reads to compute mean and stdev of read length" help="e.g. 10000" /> + <param name="mw" value="4" type="integer" label="-mw" help="minimum weight across all samples for a call (default: 4)" /> + <param name="tt" value="0" type="integer" label="-tt" help="trim threshold (default: 0)" /> + <param name="min_non_overlap" value="101" type="integer" label="min_non_overlap" help="e.g. 101" /> + <param name="discordant_z" value="5" type="integer" label="discordant_z" help="e.g. 5" /> + <param name="back_distance" value="10" type="integer" label="back_distance" help="e.g. 10" /> + <param name="weight" value="1" type="integer" label="weight" help="e.g. 1" /> + <param name="min_mapping_threshold" value="20" type="integer" label="min_mapping_threshold" help="e.g. 20" /> + <param name="probability_curve" argument="-P" type="boolean" truevalue="-P" falsevalue="" checked="true" label="output probability curve for each variant"/> + <param name="evidence" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="true" label="show evidence for each call"/> + </section> + </when> + <when value="single-read"> + <section name="additional_params" title="Additional Options" expanded="False"> + <param name="mw" value="4" type="integer" label="-mw" help="minimum weight across all samples for a call (default: 4)" /> + <param name="tt" value="0" type="integer" label="-tt" help="trim threshold (default: 0)" /> + <param name="back_distance" value="10" type="integer" label="back_distance" help="e.g. 10" /> + <param name="weight" value="1" type="integer" label="weight" help="e.g. 1" /> + <param name="min_mapping_threshold" value="20" type="integer" label="min_mapping_threshold" help="e.g. 20" /> + <param name="probability_curve" argument="-P" type="boolean" truevalue="-P" falsevalue="" checked="false" label="output probability curve for each variant"/> + <param name="evidence" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="false" label="show evidence for each call"/> + </section> + </when> + + </conditional> + <param help="get variant calling in vcf or BEDPE format" label="variant calling format" name="output_format" type="select"> + <option selected="True" value="vcf">vcf</option> + <option value="BEDPE">BEDPE</option> + </param> + </inputs> + + <outputs> + <data format="tabular" name="histogram" label="Lumpy on ${on_string}: Fragment size distribution" from_work_dir="input.lib.histo"> + <filter>seq_method['seq_method_list'] == "paired-end"</filter> + </data> + <data format="tabular" name="histogramB" label="Lumpy on ${on_string}: Fragment size distribution" from_work_dir="input.B.lib.histo"> + <filter>seq_method['seq_method_list'] == "paired-end"</filter> + <filter>analysis_type['analysis_type_list'] == "two_sample"</filter> + </data> + <data format="bam" name="splits" label="Lumpy on ${on_string}: Split Reads (Bam format)" from_work_dir="input.splitters.bam"/> + <data format="bam" name="splitsB" label="Lumpy on ${on_string}: Split Reads (Bam format)" from_work_dir="input.B.splitters.bam"> + <filter>analysis_type['analysis_type_list'] == "two_sample"</filter> + </data> + <data format="bam" name="discordants" label="Lumpy on ${on_string}: Discordant Pairs (Bam format)" from_work_dir="input.discordants.bam"> + <filter>seq_method['seq_method_list'] == "paired-end"</filter> + </data> + <data format="bam" name="discordantsB" label="Lumpy on ${on_string}: Discordant Pairs (Bam format)" from_work_dir="input.discordants.B.bam"> + <filter>seq_method['seq_method_list'] == "paired-end"</filter> + <filter>analysis_type['analysis_type_list'] == "two_sample"</filter> + </data> + <data format="vcf" name="vcf_call" label="Lumpy Variant Calling"> + <change_format> + <when format="tabular" input="output_format" value="BEDPE" /> + </change_format> + </data> + </outputs> + + <tests> + <test> + <param name="analysis_type_list" value="one_sample" /> + <param name="input_file" value="sr.input.bam" ftype="bam"/> + <param name="seq_method_list" value="single-read" /> + <param name="mw" value="4"/> + <param name="tt" value="0"/> + <param name="back_distance" value="10"/> + <param name="weight" value="1" /> + <param name="min_mapping_threshold" value="20" /> + <output name="vcf_call" file="output.vcf" ftype="vcf"/> + </test> + <test> + <param name="analysis_type_list" value="one_sample" /> + <param name="input_file" value="sr.input.bam" ftype="bam"/> + <param name="seq_method_list" value="single-read" /> + <param name="mw" value="4"/> + <param name="tt" value="0"/> + <param name="back_distance" value="10"/> + <param name="weight" value="1" /> + <param name="min_mapping_threshold" value="20" /> + <param name="evidence" value="true" /> + <param name="probability_curve" value="true" /> + <output name="vcf_call" file="output_extended.vcf" ftype="vcf" compare="sim_size"/> + </test> + <test> + <param name="analysis_type_list" value="two_sample" /> + <param name="input_file" value="sr.input.bam" ftype="bam"/> + <param name="input_fileB" value="sr.input.bam" ftype="bam"/> + <param name="seq_method_list" value="single-read" /> + <param name="mw" value="4"/> + <param name="tt" value="0"/> + <param name="back_distance" value="10"/> + <param name="weight" value="1" /> + <param name="min_mapping_threshold" value="20" /> + <output name="vcf_call" file="output_two.vcf" ftype="vcf"/> + </test> + </tests> + + <help> + +**Input(s)** + +*One sample* : lumpy search structural variations inside a single sequencing dataset + +*Two samples*: lumpy search structural variations inside and across two sequencing datasets from two samples + +Analysis of sample replicates is not implemented yet in this wrapper + +*BAM files*: Only BAM alignments produced by BWA-mem have been tested with this tool + +**Sequencing method** + +*Paired-end sequencing*: Both ends of library fragments have been sequenced, resulting in two paired sequencing datasets + +*Single-read sequencing*: Only one end of library fragment has been sequenced, resulting in a single sequencing dataset. Under these conditions, evidences of structural variation are obtained only from splited read alignments + +*Read length*: The length of the sequencing reads in the library. This information is required only for paired-end sequencing data + +*Additional options*: refer to lumpy-sv_ documentation and the publication (doi 10.1186/gb-2014-15-6-r84) + +**lumpy-sv manual** + +Read the lumpy-sv_ documentation for details on using lumpy. + +.. _lumpy-sv: https://github.com/arq5x/lumpy-sv + +**lumpy options** + +v 0.2.13 +Author: Ryan Layer (rl6sf@virginia.edu) + +Summary: Find structural variations in various signals. + +Options:: +<![CDATA[ + + -g Genome file (defines chromosome order) + -e Show evidence for each call + -w File read windows size (default 1000000) + -mw minimum weight for a call + -msw minimum per-sample weight for a call + -tt trim threshold + -x exclude file bed file + -t temp file prefix, must be to a writeable directory + -P output probability curve for each variant + -b output BEDPE instead of VCF + -sr bam_file:<file name>, + id:<sample name>, + back_distance:<distance>, + min_mapping_threshold:<mapping quality>, + weight:<sample weight>, + min_clip:<minimum clip length>, + read_group:<string> + + -pe bam_file:<file name>, + id:<sample name>, + histo_file:<file name>, + mean:<value>, + stdev:<value>, + read_length:<length>, + min_non_overlap:<length>, + discordant_z:<z value>, + back_distance:<distance>, + min_mapping_threshold:<mapping quality>, + weight:<sample weight>, + read_group:<string> + + -bedpe bedpe_file:<bedpe file>, + id:<sample name>, + weight:<sample weight> +]]> + </help> + + <citations> + <citation type="doi">10.1186/gb-2014-15-6-r84</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pairend_distro.py Mon Jul 24 08:03:17 2017 -0400 @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# (c) 2012 - Ryan M. Layer +# Hall Laboratory +# Quinlan Laboratory +# Department of Computer Science +# Department of Biochemistry and Molecular Genetics +# Department of Public Health Sciences and Center for Public Health Genomics, +# University of Virginia +# rl6sf@virginia.edu + +import sys +import numpy as np +from operator import itemgetter +from optparse import OptionParser + +# some constants for sam/bam field ids +SAM_FLAG = 1 +SAM_REFNAME = 2 +SAM_MATE_REFNAME = 6 +SAM_ISIZE = 8 + +parser = OptionParser() + +parser.add_option("-r", + "--read_length", + type="int", + dest="read_length", + help="Read length") + +parser.add_option("-X", + dest="X", + type="int", + help="Number of stdevs from mean to extend") + +parser.add_option("-N", + dest="N", + type="int", + help="Number to sample") + +parser.add_option("-o", + dest="output_file", + help="Output file") + +parser.add_option("-m", + dest="mads", + type="int", + default=10, + help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)") + +def unscaled_upper_mad(xs): + """Return a tuple consisting of the median of xs followed by the + unscaled median absolute deviation of the values in xs that lie + above the median. + """ + med = np.median(xs) + return med, np.median(xs[xs > med] - med) + + +(options, args) = parser.parse_args() + +if not options.read_length: + parser.error('Read length not given') + +if not options.X: + parser.error('X not given') + +if not options.N: + parser.error('N not given') + +if not options.output_file: + parser.error('Output file not given') + + +required = 97 +restricted = 3484 +flag_mask = required | restricted + +L = [] +c = 0 + +for l in sys.stdin: + if c >= options.N: + break + + A = l.rstrip().split('\t') + flag = int(A[SAM_FLAG]) + refname = A[SAM_REFNAME] + mate_refname = A[SAM_MATE_REFNAME] + isize = int(A[SAM_ISIZE]) + + want = mate_refname == "=" and flag & flag_mask == required and isize >= 0 + if want: + c += 1 + L.append(isize) + +# warn if very few elements in distribution +min_elements = 1000 +if len(L) < min_elements: + sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements)) + mean = "NA" + stdev = "NA" + +else: + # Remove outliers + L = np.array(L) + L.sort() + med, umad = unscaled_upper_mad(L) + upper_cutoff = med + options.mads * umad + L = L[L < upper_cutoff] + new_len = len(L) + removed = c - new_len + sys.stderr.write("Removed %d outliers with isize >= %d\n" % + (removed, upper_cutoff)) + c = new_len + + mean = np.mean(L) + stdev = np.std(L) + + start = options.read_length + end = int(mean + options.X*stdev) + + H = [0] * (end - start + 1) + s = 0 + + for x in L: + if (x >= start) and (x <= end): + j = int(x - start) + H[j] = H[ int(x - start) ] + 1 + s += 1 + + f = open(options.output_file, 'w') + + for i in range(end - start): + o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n" + f.write(o) + + + f.close() + +print('mean:' + str(mean) + '\tstdev:' + str(stdev))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.vcf Mon Jul 24 08:03:17 2017 -0400 @@ -0,0 +1,37 @@ +##fileformat=VCFv4.2 +##source=LUMPY +##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> +##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> +##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> +##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)"> +##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> +##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> +##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> +##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants"> +##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants"> +##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends"> +##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> +##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants"> +##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples"> +##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples"> +##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples"> +##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples"> +##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call"> +##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend"> +##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend"> +##ALT=<ID=DEL,Description="Deletion"> +##ALT=<ID=DUP,Description="Duplication"> +##ALT=<ID=INV,Description="Inversion"> +##ALT=<ID=DUP:TANDEM,Description="Tandem duplication"> +##ALT=<ID=INS,Description="Insertion of novel sequence"> +##ALT=<ID=CNV,Description="Copy number variable region"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant"> +##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant"> +##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant"> +##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sr_input_bam +hg38_gold_U07000.1 14 1_1 N [hg38_gold_U07000.1:1876[N . . SVTYPE=BND;STRANDS=--:19;EVENT=1;MATEID=1_2;CIPOS=0,0;CIEND=0,2;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19 GT:SU:SR ./.:19:19 +hg38_gold_U07000.1 1876 1_2 N [hg38_gold_U07000.1:14[N . . SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=1;MATEID=1_1;CIPOS=0,2;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19 GT:SU:SR ./.:19:19 +hg38_gold_U07000.1 10 2_1 N [hg38_gold_U07000.1:1897[N . . SVTYPE=BND;STRANDS=--:19;EVENT=2;MATEID=2_2;CIPOS=-1,0;CIEND=-7,5;CIPOS95=0,1;CIEND95=-2,1;IMPRECISE;SU=19;SR=19 GT:SU:SR ./.:19:19 +hg38_gold_U07000.1 1897 2_2 N [hg38_gold_U07000.1:10[N . . SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=2;MATEID=2_1;CIPOS=-7,5;CIEND=-1,0;CIPOS95=-2,1;CIEND95=0,1;IMPRECISE;SU=19;SR=19 GT:SU:SR ./.:19:19
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_extended.vcf Mon Jul 24 08:03:17 2017 -0400 @@ -0,0 +1,75 @@ +##fileformat=VCFv4.2 +##source=LUMPY +##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> +##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> +##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> +##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)"> +##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> +##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> +##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> +##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants"> +##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants"> +##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends"> +##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> +##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants"> +##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples"> +##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples"> +##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples"> +##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples"> +##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call"> +##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend"> +##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend"> +##ALT=<ID=DEL,Description="Deletion"> +##ALT=<ID=DUP,Description="Duplication"> +##ALT=<ID=INV,Description="Inversion"> +##ALT=<ID=DUP:TANDEM,Description="Tandem duplication"> +##ALT=<ID=INS,Description="Insertion of novel sequence"> +##ALT=<ID=CNV,Description="Copy number variable region"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant"> +##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant"> +##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant"> +##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sr_input_bam + Evidence: M00860:26:000000000-A6UGV:1:1101:22421:6659_2 hg38_gold_U07000.1 8 50 hg38_gold_U07000.1 1885 1932 0x17456f0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:21503:6234_2 hg38_gold_U07000.1 13 52 hg38_gold_U07000.1 1885 1933 0x17451d0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:14108:4338_2 hg38_gold_U07000.1 8 53 hg38_gold_U07000.1 1879 1932 0x1747410 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:22281:3587_2 hg38_gold_U07000.1 8 52 hg38_gold_U07000.1 1879 1944 0x174d920 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:9129:3504_2 hg38_gold_U07000.1 13 50 hg38_gold_U07000.1 1872 1932 0x1748e60 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:10154:5887_2 hg38_gold_U07000.1 10 41 hg38_gold_U07000.1 1872 1931 0x17499d0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:9382:5932_2 hg38_gold_U07000.1 8 53 hg38_gold_U07000.1 1871 1921 0x174d580 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:22747:6442_2 hg38_gold_U07000.1 8 52 hg38_gold_U07000.1 1870 1933 0x1748cc0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:6411:6050_2 hg38_gold_U07000.1 13 49 hg38_gold_U07000.1 1868 1932 0x17486a0 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:23865:5544_2 hg38_gold_U07000.1 13 56 hg38_gold_U07000.1 1868 1923 0x1748480 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:20349:5252_2 hg38_gold_U07000.1 13 51 hg38_gold_U07000.1 1868 1931 0x174ce80 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:22571:4203_2 hg38_gold_U07000.1 13 50 hg38_gold_U07000.1 1868 1926 0x1745050 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:16719:4051_2 hg38_gold_U07000.1 8 51 hg38_gold_U07000.1 1868 1932 0x1747b70 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:8961:3182_2 hg38_gold_U07000.1 15 51 hg38_gold_U07000.1 1868 1917 0x1744130 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:18427:2937_2 hg38_gold_U07000.1 8 41 hg38_gold_U07000.1 1868 1932 0x174b760 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:21139:5913_2 hg38_gold_U07000.1 12 50 hg38_gold_U07000.1 1868 1924 0x174d640 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:10523:5853_2 hg38_gold_U07000.1 8 50 hg38_gold_U07000.1 1868 1935 0x174de60 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:16433:5445_2 hg38_gold_U07000.1 13 51 hg38_gold_U07000.1 1868 1932 0x174d870 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:12490:4165_2 hg38_gold_U07000.1 13 58 hg38_gold_U07000.1 1868 1917 0x174d9f0 0 - + id:1 weight:1 +hg38_gold_U07000.1 14 1_1 N [hg38_gold_U07000.1:1876[N . . SVTYPE=BND;STRANDS=--:19;EVENT=1;MATEID=1_2;CIPOS=0,0;CIEND=0,2;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19;PRPOS=1;PREND=0.99996,3.98091e-05,1.58483e-09 GT:SU:SR ./.:19:19 +hg38_gold_U07000.1 1876 1_2 N [hg38_gold_U07000.1:14[N . . SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=1;MATEID=1_1;CIPOS=0,2;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19;PRPOS=0.99996,3.98091e-05,1.58483e-09;PREND=1 GT:SU:SR ./.:19:19 + Evidence: M00860:26:000000000-A6UGV:1:1101:7043:5583_2 hg38_gold_U07000.1 13 56 hg38_gold_U07000.1 1899 1935 0x1742010 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:19472:2185_2 hg38_gold_U07000.1 0 56 hg38_gold_U07000.1 1898 1945 0x1744270 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:9324:6462_2 hg38_gold_U07000.1 0 51 hg38_gold_U07000.1 1898 1933 0x1748fd0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:23764:6273_2 hg38_gold_U07000.1 13 48 hg38_gold_U07000.1 1898 1945 0x17490a0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:7772:5906_2 hg38_gold_U07000.1 0 54 hg38_gold_U07000.1 1898 1942 0x1747240 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:6971:4906_2 hg38_gold_U07000.1 13 51 hg38_gold_U07000.1 1898 1935 0x1746170 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:10511:4776_2 hg38_gold_U07000.1 8 50 hg38_gold_U07000.1 1898 1934 0x174a840 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:19677:3538_2 hg38_gold_U07000.1 13 51 hg38_gold_U07000.1 1898 1940 0x174c190 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:22109:4922_2 hg38_gold_U07000.1 13 53 hg38_gold_U07000.1 1897 1933 0x1743ac0 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:14027:4470_2 hg38_gold_U07000.1 0 51 hg38_gold_U07000.1 1896 1933 0x174b500 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:22736:5607_2 hg38_gold_U07000.1 13 57 hg38_gold_U07000.1 1893 1945 0x174cf30 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:10768:5508_2 hg38_gold_U07000.1 13 56 hg38_gold_U07000.1 1893 1935 0x17480b0 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:14449:3437_2 hg38_gold_U07000.1 13 48 hg38_gold_U07000.1 1893 1933 0x174b150 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:18009:3360_2 hg38_gold_U07000.1 13 47 hg38_gold_U07000.1 1893 1925 0x1749390 0 + - id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:16615:6513_2 hg38_gold_U07000.1 13 51 hg38_gold_U07000.1 1893 1931 0x174b860 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:19235:5076_2 hg38_gold_U07000.1 13 45 hg38_gold_U07000.1 1893 1932 0x174a790 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:14629:4828_2 hg38_gold_U07000.1 8 50 hg38_gold_U07000.1 1893 1932 0x174b360 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:22780:4994_2 hg38_gold_U07000.1 13 50 hg38_gold_U07000.1 1892 1924 0x174aaf0 0 - + id:1 weight:1 + Evidence: M00860:26:000000000-A6UGV:1:1101:12387:3929_2 hg38_gold_U07000.1 13 51 hg38_gold_U07000.1 1892 1931 0x174c000 0 - + id:1 weight:1 +hg38_gold_U07000.1 10 2_1 N [hg38_gold_U07000.1:1897[N . . SVTYPE=BND;STRANDS=--:19;EVENT=2;MATEID=2_2;CIPOS=-1,0;CIEND=-7,5;CIPOS95=0,1;CIEND95=-2,1;IMPRECISE;SU=19;SR=19;PRPOS=9.99999e-13,9.99999e-07;PREND=4.89496e-31,1.94872e-23,7.75799e-16,3.08851e-08,0.0308851,0.0775799,0.194872,0.489496,0.194872,0.0122956,1.94872e-09,4.89496e-17,1.22956e-24 GT:SU:SR ./.:19:19 +hg38_gold_U07000.1 1897 2_2 N [hg38_gold_U07000.1:10[N . . SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=2;MATEID=2_1;CIPOS=-7,5;CIEND=-1,0;CIPOS95=-2,1;CIEND95=0,1;IMPRECISE;SU=19;SR=19;PRPOS=4.89496e-31,1.94872e-23,7.75799e-16,3.08851e-08,0.0308851,0.0775799,0.194872,0.489496,0.194872,0.0122956,1.94872e-09,4.89496e-17,1.22956e-24;PREND=9.99999e-13,9.99999e-07 GT:SU:SR ./.:19:19
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_two.vcf Mon Jul 24 08:03:17 2017 -0400 @@ -0,0 +1,37 @@ +##fileformat=VCFv4.2 +##source=LUMPY +##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> +##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles"> +##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> +##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)"> +##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> +##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> +##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> +##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants"> +##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants"> +##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends"> +##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> +##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants"> +##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples"> +##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples"> +##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples"> +##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples"> +##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call"> +##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend"> +##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend"> +##ALT=<ID=DEL,Description="Deletion"> +##ALT=<ID=DUP,Description="Duplication"> +##ALT=<ID=INV,Description="Inversion"> +##ALT=<ID=DUP:TANDEM,Description="Tandem duplication"> +##ALT=<ID=INS,Description="Insertion of novel sequence"> +##ALT=<ID=CNV,Description="Copy number variable region"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant"> +##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant"> +##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant"> +##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sr_input_bam_a sr_input_bam_b +hg38_gold_U07000.1 14 1_1 N [hg38_gold_U07000.1:1876[N . . SVTYPE=BND;STRANDS=--:38;EVENT=1;MATEID=1_2;CIPOS=0,0;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=38;SR=38 GT:SU:SR ./.:19:19 ./.:19:19 +hg38_gold_U07000.1 1876 1_2 N [hg38_gold_U07000.1:14[N . . SVTYPE=BND;STRANDS=--:38;SECONDARY;EVENT=1;MATEID=1_1;CIPOS=0,0;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=38;SR=38 GT:SU:SR ./.:19:19 ./.:19:19 +hg38_gold_U07000.1 10 2_1 N [hg38_gold_U07000.1:1897[N . . SVTYPE=BND;STRANDS=--:38;EVENT=2;MATEID=2_2;CIPOS=0,0;CIEND=-7,5;CIPOS95=0,1;CIEND95=-1,1;IMPRECISE;SU=38;SR=38 GT:SU:SR ./.:19:19 ./.:19:19 +hg38_gold_U07000.1 1897 2_2 N [hg38_gold_U07000.1:10[N . . SVTYPE=BND;STRANDS=--:38;SECONDARY;EVENT=2;MATEID=2_1;CIPOS=-7,5;CIEND=0,0;CIPOS95=-1,1;CIEND95=0,1;IMPRECISE;SU=38;SR=38 GT:SU:SR ./.:19:19 ./.:19:19