Mercurial > repos > artbio > lumpy_sv

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extractSplitReads_BwaMem.py	Mon Jul 24 08:03:17 2017 -0400
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+
+import sys
+import getopt
+import string
+from optparse import OptionParser
+import re
+
+def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap):
+    if inFile == "stdin":
+        data = sys.stdin
+    else:
+        data = open(inFile, 'r')
+    for line in data:
+        split = 0
+        if line[0] == '@':
+            print line.strip()
+            continue
+        samList = line.strip().split('\t')
+        sam = SAM(samList)
+        if includeDups==0 and (1024 & sam.flag)==1024:
+            continue
+        for el in sam.tags:
+            if "SA:" in el:
+                if(len(el.split(";")))<=numSplits:
+                    split = 1
+                    mate = el.split(",")
+                    mateCigar = mate[3]
+                    mateFlag = int(0)
+                    if mate[2]=="-": mateFlag = int(16)
+        if split:
+            read1 = sam.flag & 64
+            if read1 == 64: tag = "_1"
+            else: tag="_2"
+            samList[0] = sam.query + tag
+            readCigar = sam.cigar
+            readCigarOps = extractCigarOps(readCigar,sam.flag)
+            readQueryPos = calcQueryPosFromCigar(readCigarOps)
+            mateCigarOps = extractCigarOps(mateCigar,mateFlag)
+            mateQueryPos = calcQueryPosFromCigar(mateCigarOps)
+            overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos)
+            nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap
+            nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap
+            mno = min(nonOverlap1, nonOverlap2)
+            if mno >= minNonOverlap:
+                print "\t".join(samList)
+
+#--------------------------------------------------------------------------------------------------
+# functions
+#--------------------------------------------------------------------------------------------------
+
+class SAM (object):
+    """
+    __very__ basic class for SAM input.
+    """
+    def __init__(self, samList = []):
+        if len(samList) > 0:
+            self.query    = samList[0]
+            self.flag     = int(samList[1])
+            self.ref      = samList[2]
+            self.pos      = int(samList[3])
+            self.mapq     = int(samList[4])
+            self.cigar    = samList[5]
+            self.matRef   = samList[6]
+            self.matePos  = int(samList[7])
+            self.iSize    = int(samList[8])
+            self.seq      = samList[9]
+            self.qual     = samList[10]
+            self.tags     = samList[11:]#tags is a list of each tag:vtype:value sets
+            self.valid    = 1
+        else:
+            self.valid = 0
+            self.query = 'null'
+
+    def extractTagValue (self, tagID):
+        for tag in self.tags:
+            tagParts = tag.split(':', 2);
+            if (tagParts[0] == tagID):
+                if (tagParts[1] == 'i'):
+                    return int(tagParts[2]);
+                elif (tagParts[1] == 'H'):
+                    return int(tagParts[2],16);
+                return tagParts[2];
+        return None;
+
+#-----------------------------------------------
+cigarPattern = '([0-9]+[MIDNSHP])'
+cigarSearch = re.compile(cigarPattern)
+atomicCigarPattern = '([0-9]+)([MIDNSHP])'
+atomicCigarSearch = re.compile(atomicCigarPattern)
+
+def extractCigarOps(cigar,flag):
+    if (cigar == "*"):
+        cigarOps = []
+    elif (flag & 0x0010):
+        cigarOpStrings = cigarSearch.findall(cigar)
+        cigarOps = []
+        for opString in cigarOpStrings:
+            cigarOpList = atomicCigarSearch.findall(opString)
+#            print cigarOpList
+            # "struct" for the op and it's length
+            cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1])
+            # add to the list of cigarOps
+            cigarOps.append(cigar)
+            cigarOps = cigarOps
+        cigarOps.reverse()
+        ##do in reverse order because negative strand##
+    else:
+        cigarOpStrings = cigarSearch.findall(cigar)
+        cigarOps = []
+        for opString in cigarOpStrings:
+            cigarOpList = atomicCigarSearch.findall(opString)
+            # "struct" for the op and it's length
+            cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1])
+            # add to the list of cigarOps
+            cigarOps.append(cigar)
+#            cigarOps = cigarOps
+    return(cigarOps)
+
+def calcQueryPosFromCigar(cigarOps):
+    qsPos = 0
+    qePos = 0
+    qLen  = 0
+    # if first op is a H, need to shift start position
+    # the opPosition counter sees if the for loop is looking at the first index of the cigar object
+    opPosition = 0
+    for cigar in cigarOps:
+        if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'):
+            qsPos += cigar.length
+            qePos += cigar.length
+            qLen  += cigar.length
+        elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'):
+            qLen  += cigar.length
+        elif cigar.op == 'M' or cigar.op == 'I':
+            qePos += cigar.length
+            qLen  += cigar.length
+            opPosition += 1
+    d = queryPos(qsPos, qePos, qLen);
+    return d
+
+class cigarOp (object):
+    """
+    sturct to store a discrete CIGAR operations
+    """
+    def __init__(self, opLength, op):
+        self.length = int(opLength)
+        self.op     = op
+
+class queryPos (object):
+    """
+    struct to store the start and end positions of query CIGAR operations
+    """
+    def __init__(self, qsPos, qePos, qLen):
+        self.qsPos = int(qsPos)
+        self.qePos = int(qePos)
+        self.qLen  = int(qLen)
+
+
+def calcQueryOverlap(s1,e1,s2,e2):
+    o = 1 + min(e1, e2) - max(s1, s2)
+    return max(0, o)
+
+###############################################
+
+class Usage(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+
+def main():
+
+    usage = """%prog -i <file>
+
+extractSplitReads_BwaMem v0.1.0
+Author: Ira Hall
+Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates.
+Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405.
+    """
+    parser = OptionParser(usage)
+
+    parser.add_option("-i", "--inFile", dest="inFile",
+        help="A SAM file or standard input (-i stdin).",
+        metavar="FILE")
+    parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int",
+        help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2",
+        metavar="INT")
+    parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0,
+        help="Include alignments marked as duplicates. Default=False")
+    parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int",
+        help="minimum non-overlap between split alignments on the query (default=20)",
+        metavar="INT")
+    (opts, args) = parser.parse_args()
+    if opts.inFile is None:
+        parser.print_help()
+        print
+    else:
+        try:
+            extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap)
+        except IOError as err:
+            sys.stderr.write("IOError " + str(err) + "\n");
+            return
+if __name__ == "__main__":
+    sys.exit(main())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lumpy.xml	Mon Jul 24 08:03:17 2017 -0400
@@ -0,0 +1,284 @@
+<tool id="lumpy" name="lumpy-sv" version="1.0.0">
+    <description>find structural variants</description>
+    <requirements>
+        <requirement type="package" version="0.2.13">lumpy-sv</requirement>
+        <requirement type="package" version="1.3.1">samtools</requirement>
+        <requirement type="package" version="1.11.2=py27_0">numpy</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <command detect_errors="exit_code"><![CDATA[
+        #import re
+        #set one_sample_bam = re.sub('[^\w\-]', '_', str($analysis_type.input_file.element_identifier))
+        #if $analysis_type.analysis_type_list == "one_sample":
+            ln -f -s '$analysis_type.input_file' '$one_sample_bam' &&
+        #else:
+            #set sample_a_bam = re.sub('[^\w\-]', '_', str($analysis_type.input_file.element_identifier))
+            #set sample_b_bam = re.sub('[^\w\-]', '_', str($analysis_type.input_fileB.element_identifier))
+            #if $sample_a_bam == $sample_b_bam:
+                #set sample_a_bam = "%s_a" % str($sample_a_bam)
+                #set sample_b_bam = "%s_b" % str($sample_b_bam)
+            #end if
+            ln -f -s '$analysis_type.input_file' '$sample_a_bam' &&
+            ln -f -s '$analysis_type.input_fileB' '$sample_b_bam' &&
+        #end if
+
+        #if $analysis_type.analysis_type_list == "one_sample":
+
+            #if $seq_method.seq_method_list == "paired-end":
+                samtools view -u -F 1294 '$one_sample_bam' | samtools sort -O bam -o input.discordants.bam &&
+                samtools view -h '$one_sample_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam &&
+                samtools view '$one_sample_bam'
+                    |python $__tool_directory__/pairend_distro.py -r $analysis_type.readLength -X 4 -N $seq_method.additional_params.samplingValue -o input.lib.histo > meandev.txt &&
+                mean=\$(cat meandev.txt | sed s/mean:// | sed -r s/stdev:.+//) &&
+                stdev=\$(cat meandev.txt | sed -r s/mean:.+stdev://) &&
+                lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt
+                    #if $output_format == "BEDPE":
+                        -b
+                    #end if
+                    -pe id:'$one_sample_bam',bam_file:input.discordants.bam,histo_file:input.lib.histo,mean:"\$mean",stdev:"\$stdev",read_length:$analysis_type.readLength,min_non_overlap:$seq_method.additional_params.min_non_overlap,discordant_z:$seq_method.additional_params.discordant_z,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold
+                    -sr id:'$one_sample_bam',bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call'
+            #elif $seq_method.seq_method_list == "single-read":
+                samtools view -h '$one_sample_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam &&
+                lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt
+                    #if $output_format == "BEDPE":
+                        -b
+                    #end if
+                    -sr id:'$one_sample_bam',bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call'
+            #end if
+        #else:
+            #if $seq_method.seq_method_list == "paired-end":
+                samtools view -u -F 1294 '$sample_a_bam' | samtools sort -O bam -o input.discordants.bam &&
+                samtools view -u -F 1294 '$sample_b_bam' | samtools sort -O bam -o input.B.discordants.bam &&
+                samtools view -h '$sample_a_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam &&
+                samtools view -h '$sample_b_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.B.splitters.bam &&
+                samtools view  '$sample_a_bam'
+                    |python $__tool_directory__/pairend_distro.py -r $analysis_type.readLength -X 4 -N $seq_method.additional_params.samplingValue -o input.lib.histo > meandevA.txt &&
+                samtools view  '$sample_b_bam'
+                    |python $__tool_directory__/pairend_distro.py -r $analysis_type.readLengthB -X 4 -N $seq_method.additional_params.samplingValue -o input.B.lib.histo > meandevB.txt &&
+                meanA=\$(cat meandevA.txt | sed s/mean:// | sed -r s/stdev:.+//) &&
+                meanB=\$(cat meandevB.txt | sed s/mean:// | sed -r s/stdev:.+//) &&
+                stdevA=\$(cat meandevA.txt | sed -r s/mean:.+stdev://) &&
+                stdevB=\$(cat meandevB.txt | sed -r s/mean:.+stdev://) &&
+                lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt
+                    #if $output_format == "BEDPE":
+                        -b
+                    #end if
+                    -pe id:inputA.bam,bam_file:input.discordants.bam,histo_file:input.lib.histo,mean:"\$meanA",stdev:"\$stdevA",read_length:$analysis_type.readLength,min_non_overlap:$seq_method.additional_params.min_non_overlap,discordant_z:$seq_method.additional_params.discordant_z,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold
+                    -pe id:inputB.bam,bam_file:input.B.discordants.bam,histo_file:input.B.lib.histo,mean:"\$meanB",stdev:"\$stdevA",read_length:$analysis_type.readLengthB,min_non_overlap:$seq_method.additional_params.min_non_overlap,discordant_z:$seq_method.additional_params.discordant_z,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold
+                    -sr id:inputA.bam,bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold
+                    -sr id:inputB.bam,bam_file:input.B.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call'
+            #elif $seq_method.seq_method_list == "single-read":
+                samtools view -h '$sample_a_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.splitters.bam &&
+                samtools view -h '$sample_b_bam' | python $__tool_directory__/extractSplitReads_BwaMem.py -i stdin | samtools sort -O bam -o input.B.splitters.bam &&
+                lumpy $seq_method.additional_params.evidence $seq_method.additional_params.probability_curve -mw $seq_method.additional_params.mw -tt $seq_method.additional_params.tt
+                    #if $output_format == "BEDPE":
+                        -b
+                    #end if
+                    -sr id:'$sample_a_bam',bam_file:input.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold
+                    -sr id:'$sample_b_bam',bam_file:input.B.splitters.bam,back_distance:$seq_method.additional_params.back_distance,weight:$seq_method.additional_params.weight,min_mapping_threshold:$seq_method.additional_params.min_mapping_threshold > '$vcf_call'
+            #end if
+        #end if
+
+    ]]></command>
+    <!-- basic error handling -->
+    <inputs>
+        <conditional name="analysis_type">
+            <param help="Single or paired conditions (eg tumor vs normal)" label="Input(s)" name="analysis_type_list" type="select">
+                <option selected="True" value="one_sample">One Sample</option>
+                <option value="two_sample">Two samples</option>
+            </param>
+            <when value="one_sample">
+                <param format="bam" name="input_file" type="data" label="One BAM alignment file produced by BWA-mem"/>
+                <param name="readLength" value="151"  type="integer" label="read length" help="e.g. 151 nt" />
+            </when>
+            <when value="two_sample">
+                <param format="bam" name="input_file" type="data" label="One BAM alignment file produced by BWA-mem"/>
+                <param name="readLength" value="151"  type="integer" label="read length" help="e.g. 151 nt" />
+                <param format="bam" name="input_fileB" type="data" label="One BAM alignment file produced by BWA-mem"/>
+                <param name="readLengthB" value="151"  type="integer" label="read length" help="e.g. 151 nt" />
+            </when>
+        </conditional>
+        <conditional name="seq_method">
+            <param help="Paired-end or single-read sequencing" label="Sequencing method" name="seq_method_list" type="select">
+                <option selected="True" value="paired-end">Paired-end sequencing</option>
+                <option value="single-read">Single-read sequencing</option>
+            </param>
+            <when value="paired-end">
+                <section name="additional_params" title="Additional Options" expanded="False">
+                    <param name="samplingValue" value="100000"  type="integer" label="number of reads to compute mean and stdev of read length" help="e.g. 10000" />
+                    <param name="mw" value="4"  type="integer" label="-mw" help="minimum weight across all samples for a call (default: 4)" />
+                    <param name="tt" value="0"  type="integer" label="-tt" help="trim threshold (default: 0)" />
+                    <param name="min_non_overlap" value="101"  type="integer" label="min_non_overlap" help="e.g. 101" />
+                    <param name="discordant_z" value="5"  type="integer" label="discordant_z" help="e.g. 5" />
+                    <param name="back_distance" value="10"  type="integer" label="back_distance" help="e.g. 10" />
+                    <param name="weight" value="1"  type="integer" label="weight" help="e.g. 1" />
+                    <param name="min_mapping_threshold" value="20"  type="integer" label="min_mapping_threshold" help="e.g. 20" />
+                    <param name="probability_curve" argument="-P" type="boolean" truevalue="-P" falsevalue="" checked="true" label="output probability curve for each variant"/>
+                    <param name="evidence" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="true" label="show evidence for each call"/>
+                </section>
+            </when>
+            <when value="single-read">
+                <section name="additional_params" title="Additional Options" expanded="False">
+                    <param name="mw" value="4"  type="integer" label="-mw" help="minimum weight across all samples for a call (default: 4)" />
+                    <param name="tt" value="0"  type="integer" label="-tt" help="trim threshold (default: 0)" />
+                    <param name="back_distance" value="10"  type="integer" label="back_distance" help="e.g. 10" />
+                    <param name="weight" value="1"  type="integer" label="weight" help="e.g. 1" />
+                    <param name="min_mapping_threshold" value="20"  type="integer" label="min_mapping_threshold" help="e.g. 20" />
+                    <param name="probability_curve" argument="-P" type="boolean" truevalue="-P" falsevalue="" checked="false" label="output probability curve for each variant"/>
+                    <param name="evidence" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="false" label="show evidence for each call"/>
+                </section>
+            </when>
+
+        </conditional>
+            <param help="get variant calling in vcf or BEDPE format" label="variant calling format" name="output_format" type="select">
+                <option selected="True" value="vcf">vcf</option>
+                <option value="BEDPE">BEDPE</option>
+            </param>
+    </inputs>
+
+    <outputs>
+        <data format="tabular" name="histogram" label="Lumpy on ${on_string}: Fragment size distribution" from_work_dir="input.lib.histo">
+            <filter>seq_method['seq_method_list'] == "paired-end"</filter>
+        </data>
+        <data format="tabular" name="histogramB" label="Lumpy on ${on_string}: Fragment size distribution" from_work_dir="input.B.lib.histo">
+            <filter>seq_method['seq_method_list'] == "paired-end"</filter>
+            <filter>analysis_type['analysis_type_list'] == "two_sample"</filter>
+        </data>
+        <data format="bam" name="splits" label="Lumpy on ${on_string}: Split Reads (Bam format)" from_work_dir="input.splitters.bam"/>
+        <data format="bam" name="splitsB" label="Lumpy on ${on_string}: Split Reads (Bam format)" from_work_dir="input.B.splitters.bam">
+            <filter>analysis_type['analysis_type_list'] == "two_sample"</filter>
+        </data>
+        <data format="bam" name="discordants" label="Lumpy on ${on_string}: Discordant Pairs (Bam format)" from_work_dir="input.discordants.bam">
+            <filter>seq_method['seq_method_list'] == "paired-end"</filter>
+        </data>
+        <data format="bam" name="discordantsB" label="Lumpy on ${on_string}: Discordant Pairs (Bam format)" from_work_dir="input.discordants.B.bam">
+            <filter>seq_method['seq_method_list'] == "paired-end"</filter>
+            <filter>analysis_type['analysis_type_list'] == "two_sample"</filter>
+        </data>
+        <data format="vcf" name="vcf_call" label="Lumpy Variant Calling">
+            <change_format>
+                <when format="tabular" input="output_format" value="BEDPE" />
+            </change_format>
+        </data>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="analysis_type_list" value="one_sample" />
+            <param name="input_file" value="sr.input.bam" ftype="bam"/>
+            <param name="seq_method_list" value="single-read" />
+            <param name="mw" value="4"/>
+            <param name="tt" value="0"/>
+            <param name="back_distance" value="10"/>
+            <param name="weight" value="1" />
+            <param name="min_mapping_threshold" value="20" />
+            <output name="vcf_call" file="output.vcf" ftype="vcf"/>
+        </test>
+        <test>
+            <param name="analysis_type_list" value="one_sample" />
+            <param name="input_file" value="sr.input.bam" ftype="bam"/>
+            <param name="seq_method_list" value="single-read" />
+            <param name="mw" value="4"/>
+            <param name="tt" value="0"/>
+            <param name="back_distance" value="10"/>
+            <param name="weight" value="1" />
+            <param name="min_mapping_threshold" value="20" />
+            <param name="evidence" value="true" />
+            <param name="probability_curve" value="true" />
+            <output name="vcf_call" file="output_extended.vcf" ftype="vcf" compare="sim_size"/>
+        </test>
+        <test>
+            <param name="analysis_type_list" value="two_sample" />
+            <param name="input_file" value="sr.input.bam" ftype="bam"/>
+            <param name="input_fileB" value="sr.input.bam" ftype="bam"/>
+            <param name="seq_method_list" value="single-read" />
+            <param name="mw" value="4"/>
+            <param name="tt" value="0"/>
+            <param name="back_distance" value="10"/>
+            <param name="weight" value="1" />
+            <param name="min_mapping_threshold" value="20" />
+            <output name="vcf_call" file="output_two.vcf" ftype="vcf"/>
+        </test>
+   </tests>
+
+    <help>
+
+**Input(s)**
+
+*One sample* : lumpy search structural variations inside a single sequencing dataset
+
+*Two samples*: lumpy search structural variations inside and across two sequencing datasets from two samples
+
+Analysis of sample replicates is not implemented yet in this wrapper
+
+*BAM files*: Only BAM alignments produced by BWA-mem have been tested with this tool
+
+**Sequencing method**
+
+*Paired-end sequencing*: Both ends of library fragments have been sequenced, resulting in two paired sequencing datasets
+
+*Single-read sequencing*: Only one end of library fragment has been sequenced, resulting in a single sequencing dataset. Under these conditions, evidences of structural variation are obtained only from splited read alignments
+
+*Read length*: The length of the sequencing reads in the library. This information is required only for paired-end sequencing data
+
+*Additional options*: refer to lumpy-sv_ documentation and the publication (doi 10.1186/gb-2014-15-6-r84)
+
+**lumpy-sv manual**
+
+Read the lumpy-sv_ documentation for details on using lumpy.
+
+.. _lumpy-sv: https://github.com/arq5x/lumpy-sv
+
+**lumpy options**
+
+v 0.2.13
+Author:  Ryan Layer (rl6sf@virginia.edu)
+
+Summary: Find structural variations in various signals.
+
+Options::
+<![CDATA[
+
+	-g	Genome file (defines chromosome order)
+	-e	Show evidence for each call
+	-w	File read windows size (default 1000000)
+	-mw	minimum weight for a call
+	-msw	minimum per-sample weight for a call
+	-tt	trim threshold
+	-x	exclude file bed file
+	-t	temp file prefix, must be to a writeable directory
+	-P	output probability curve for each variant
+	-b	output BEDPE instead of VCF
+	-sr	bam_file:<file name>,
+		id:<sample name>,
+		back_distance:<distance>,
+		min_mapping_threshold:<mapping quality>,
+		weight:<sample weight>,
+		min_clip:<minimum clip length>,
+		read_group:<string>
+
+	-pe	bam_file:<file name>,
+		id:<sample name>,
+		histo_file:<file name>,
+		mean:<value>,
+		stdev:<value>,
+		read_length:<length>,
+		min_non_overlap:<length>,
+		discordant_z:<z value>,
+		back_distance:<distance>,
+		min_mapping_threshold:<mapping quality>,
+		weight:<sample weight>,
+		read_group:<string>
+
+	-bedpe	bedpe_file:<bedpe file>,
+		id:<sample name>,
+		weight:<sample weight>
+]]>
+    </help>
+
+    <citations>
+    <citation type="doi">10.1186/gb-2014-15-6-r84</citation>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pairend_distro.py	Mon Jul 24 08:03:17 2017 -0400
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+#  (c) 2012 - Ryan M. Layer
+#  Hall Laboratory
+#  Quinlan Laboratory
+#  Department of Computer Science
+#  Department of Biochemistry and Molecular Genetics
+#  Department of Public Health Sciences and Center for Public Health Genomics,
+#  University of Virginia
+#  rl6sf@virginia.edu
+
+import sys
+import numpy as np
+from operator import itemgetter
+from optparse import OptionParser
+
+# some constants for sam/bam field ids
+SAM_FLAG = 1
+SAM_REFNAME = 2
+SAM_MATE_REFNAME = 6
+SAM_ISIZE = 8
+
+parser = OptionParser()
+
+parser.add_option("-r",
+    "--read_length",
+    type="int",
+    dest="read_length",
+    help="Read length")
+
+parser.add_option("-X",
+    dest="X",
+    type="int",
+    help="Number of stdevs from mean to extend")
+
+parser.add_option("-N",
+    dest="N",
+    type="int",
+    help="Number to sample")
+
+parser.add_option("-o",
+    dest="output_file",
+    help="Output file")
+
+parser.add_option("-m",
+    dest="mads",
+    type="int",
+    default=10,
+    help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)")
+
+def unscaled_upper_mad(xs):
+    """Return a tuple consisting of the median of xs followed by the
+    unscaled median absolute deviation of the values in xs that lie
+    above the median.
+    """
+    med = np.median(xs)
+    return med, np.median(xs[xs > med] - med)
+
+
+(options, args) = parser.parse_args()
+
+if not options.read_length:
+    parser.error('Read length not given')
+
+if not options.X:
+    parser.error('X not given')
+
+if not options.N:
+    parser.error('N not given')
+
+if not options.output_file:
+    parser.error('Output file not given')
+
+
+required = 97
+restricted = 3484
+flag_mask = required | restricted
+
+L = []
+c = 0
+
+for l in sys.stdin:
+    if c >= options.N:
+        break
+
+    A = l.rstrip().split('\t')
+    flag = int(A[SAM_FLAG])
+    refname = A[SAM_REFNAME]
+    mate_refname = A[SAM_MATE_REFNAME]
+    isize = int(A[SAM_ISIZE])
+
+    want = mate_refname == "=" and flag & flag_mask == required and isize >= 0
+    if want:
+        c += 1
+        L.append(isize)
+
+# warn if very few elements in distribution
+min_elements = 1000
+if len(L) < min_elements:
+    sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements))
+    mean = "NA"
+    stdev = "NA"
+
+else:
+    # Remove outliers
+    L = np.array(L)
+    L.sort()
+    med, umad = unscaled_upper_mad(L)
+    upper_cutoff = med + options.mads * umad
+    L = L[L < upper_cutoff]
+    new_len = len(L)
+    removed = c - new_len
+    sys.stderr.write("Removed %d outliers with isize >= %d\n" %
+        (removed, upper_cutoff))
+    c = new_len
+
+    mean = np.mean(L)
+    stdev = np.std(L)
+
+    start = options.read_length
+    end = int(mean + options.X*stdev)
+
+    H = [0] * (end - start + 1)
+    s = 0
+
+    for x in L:
+        if (x >= start) and (x <= end):
+            j = int(x - start)
+            H[j] = H[ int(x - start) ] + 1
+            s += 1
+
+    f = open(options.output_file, 'w')
+
+    for i in range(end - start):
+        o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n"
+        f.write(o)
+
+
+    f.close()
+
+print('mean:' + str(mean) + '\tstdev:' + str(stdev))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.vcf	Mon Jul 24 08:03:17 2017 -0400
@@ -0,0 +1,37 @@
+##fileformat=VCFv4.2
+##source=LUMPY
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
+##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
+##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
+##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants">
+##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants">
+##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
+##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
+##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants">
+##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples">
+##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples">
+##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples">
+##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples">
+##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call">
+##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend">
+##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend">
+##ALT=<ID=DEL,Description="Deletion">
+##ALT=<ID=DUP,Description="Duplication">
+##ALT=<ID=INV,Description="Inversion">
+##ALT=<ID=DUP:TANDEM,Description="Tandem duplication">
+##ALT=<ID=INS,Description="Insertion of novel sequence">
+##ALT=<ID=CNV,Description="Copy number variable region">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">
+##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant">
+##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant">
+##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sr_input_bam
+hg38_gold_U07000.1	14	1_1	N	[hg38_gold_U07000.1:1876[N	.	.	SVTYPE=BND;STRANDS=--:19;EVENT=1;MATEID=1_2;CIPOS=0,0;CIEND=0,2;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19	GT:SU:SR	./.:19:19
+hg38_gold_U07000.1	1876	1_2	N	[hg38_gold_U07000.1:14[N	.	.	SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=1;MATEID=1_1;CIPOS=0,2;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19	GT:SU:SR	./.:19:19
+hg38_gold_U07000.1	10	2_1	N	[hg38_gold_U07000.1:1897[N	.	.	SVTYPE=BND;STRANDS=--:19;EVENT=2;MATEID=2_2;CIPOS=-1,0;CIEND=-7,5;CIPOS95=0,1;CIEND95=-2,1;IMPRECISE;SU=19;SR=19	GT:SU:SR	./.:19:19
+hg38_gold_U07000.1	1897	2_2	N	[hg38_gold_U07000.1:10[N	.	.	SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=2;MATEID=2_1;CIPOS=-7,5;CIEND=-1,0;CIPOS95=-2,1;CIEND95=0,1;IMPRECISE;SU=19;SR=19	GT:SU:SR	./.:19:19
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_extended.vcf	Mon Jul 24 08:03:17 2017 -0400
@@ -0,0 +1,75 @@
+##fileformat=VCFv4.2
+##source=LUMPY
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
+##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
+##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
+##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants">
+##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants">
+##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
+##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
+##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants">
+##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples">
+##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples">
+##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples">
+##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples">
+##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call">
+##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend">
+##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend">
+##ALT=<ID=DEL,Description="Deletion">
+##ALT=<ID=DUP,Description="Duplication">
+##ALT=<ID=INV,Description="Inversion">
+##ALT=<ID=DUP:TANDEM,Description="Tandem duplication">
+##ALT=<ID=INS,Description="Insertion of novel sequence">
+##ALT=<ID=CNV,Description="Copy number variable region">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">
+##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant">
+##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant">
+##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sr_input_bam
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22421:6659_2	hg38_gold_U07000.1	8	50	hg38_gold_U07000.1	1885	1932	0x17456f0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:21503:6234_2	hg38_gold_U07000.1	13	52	hg38_gold_U07000.1	1885	1933	0x17451d0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:14108:4338_2	hg38_gold_U07000.1	8	53	hg38_gold_U07000.1	1879	1932	0x1747410	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22281:3587_2	hg38_gold_U07000.1	8	52	hg38_gold_U07000.1	1879	1944	0x174d920	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:9129:3504_2	hg38_gold_U07000.1	13	50	hg38_gold_U07000.1	1872	1932	0x1748e60	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:10154:5887_2	hg38_gold_U07000.1	10	41	hg38_gold_U07000.1	1872	1931	0x17499d0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:9382:5932_2	hg38_gold_U07000.1	8	53	hg38_gold_U07000.1	1871	1921	0x174d580	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22747:6442_2	hg38_gold_U07000.1	8	52	hg38_gold_U07000.1	1870	1933	0x1748cc0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:6411:6050_2	hg38_gold_U07000.1	13	49	hg38_gold_U07000.1	1868	1932	0x17486a0	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:23865:5544_2	hg38_gold_U07000.1	13	56	hg38_gold_U07000.1	1868	1923	0x1748480	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:20349:5252_2	hg38_gold_U07000.1	13	51	hg38_gold_U07000.1	1868	1931	0x174ce80	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22571:4203_2	hg38_gold_U07000.1	13	50	hg38_gold_U07000.1	1868	1926	0x1745050	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:16719:4051_2	hg38_gold_U07000.1	8	51	hg38_gold_U07000.1	1868	1932	0x1747b70	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:8961:3182_2	hg38_gold_U07000.1	15	51	hg38_gold_U07000.1	1868	1917	0x1744130	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:18427:2937_2	hg38_gold_U07000.1	8	41	hg38_gold_U07000.1	1868	1932	0x174b760	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:21139:5913_2	hg38_gold_U07000.1	12	50	hg38_gold_U07000.1	1868	1924	0x174d640	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:10523:5853_2	hg38_gold_U07000.1	8	50	hg38_gold_U07000.1	1868	1935	0x174de60	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:16433:5445_2	hg38_gold_U07000.1	13	51	hg38_gold_U07000.1	1868	1932	0x174d870	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:12490:4165_2	hg38_gold_U07000.1	13	58	hg38_gold_U07000.1	1868	1917	0x174d9f0	0	-	+	id:1	weight:1
+hg38_gold_U07000.1	14	1_1	N	[hg38_gold_U07000.1:1876[N	.	.	SVTYPE=BND;STRANDS=--:19;EVENT=1;MATEID=1_2;CIPOS=0,0;CIEND=0,2;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19;PRPOS=1;PREND=0.99996,3.98091e-05,1.58483e-09	GT:SU:SR	./.:19:19
+hg38_gold_U07000.1	1876	1_2	N	[hg38_gold_U07000.1:14[N	.	.	SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=1;MATEID=1_1;CIPOS=0,2;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=19;SR=19;PRPOS=0.99996,3.98091e-05,1.58483e-09;PREND=1	GT:SU:SR	./.:19:19
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:7043:5583_2	hg38_gold_U07000.1	13	56	hg38_gold_U07000.1	1899	1935	0x1742010	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:19472:2185_2	hg38_gold_U07000.1	0	56	hg38_gold_U07000.1	1898	1945	0x1744270	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:9324:6462_2	hg38_gold_U07000.1	0	51	hg38_gold_U07000.1	1898	1933	0x1748fd0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:23764:6273_2	hg38_gold_U07000.1	13	48	hg38_gold_U07000.1	1898	1945	0x17490a0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:7772:5906_2	hg38_gold_U07000.1	0	54	hg38_gold_U07000.1	1898	1942	0x1747240	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:6971:4906_2	hg38_gold_U07000.1	13	51	hg38_gold_U07000.1	1898	1935	0x1746170	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:10511:4776_2	hg38_gold_U07000.1	8	50	hg38_gold_U07000.1	1898	1934	0x174a840	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:19677:3538_2	hg38_gold_U07000.1	13	51	hg38_gold_U07000.1	1898	1940	0x174c190	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22109:4922_2	hg38_gold_U07000.1	13	53	hg38_gold_U07000.1	1897	1933	0x1743ac0	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:14027:4470_2	hg38_gold_U07000.1	0	51	hg38_gold_U07000.1	1896	1933	0x174b500	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22736:5607_2	hg38_gold_U07000.1	13	57	hg38_gold_U07000.1	1893	1945	0x174cf30	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:10768:5508_2	hg38_gold_U07000.1	13	56	hg38_gold_U07000.1	1893	1935	0x17480b0	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:14449:3437_2	hg38_gold_U07000.1	13	48	hg38_gold_U07000.1	1893	1933	0x174b150	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:18009:3360_2	hg38_gold_U07000.1	13	47	hg38_gold_U07000.1	1893	1925	0x1749390	0	+	-	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:16615:6513_2	hg38_gold_U07000.1	13	51	hg38_gold_U07000.1	1893	1931	0x174b860	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:19235:5076_2	hg38_gold_U07000.1	13	45	hg38_gold_U07000.1	1893	1932	0x174a790	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:14629:4828_2	hg38_gold_U07000.1	8	50	hg38_gold_U07000.1	1893	1932	0x174b360	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:22780:4994_2	hg38_gold_U07000.1	13	50	hg38_gold_U07000.1	1892	1924	0x174aaf0	0	-	+	id:1	weight:1
+	Evidence:	M00860:26:000000000-A6UGV:1:1101:12387:3929_2	hg38_gold_U07000.1	13	51	hg38_gold_U07000.1	1892	1931	0x174c000	0	-	+	id:1	weight:1
+hg38_gold_U07000.1	10	2_1	N	[hg38_gold_U07000.1:1897[N	.	.	SVTYPE=BND;STRANDS=--:19;EVENT=2;MATEID=2_2;CIPOS=-1,0;CIEND=-7,5;CIPOS95=0,1;CIEND95=-2,1;IMPRECISE;SU=19;SR=19;PRPOS=9.99999e-13,9.99999e-07;PREND=4.89496e-31,1.94872e-23,7.75799e-16,3.08851e-08,0.0308851,0.0775799,0.194872,0.489496,0.194872,0.0122956,1.94872e-09,4.89496e-17,1.22956e-24	GT:SU:SR	./.:19:19
+hg38_gold_U07000.1	1897	2_2	N	[hg38_gold_U07000.1:10[N	.	.	SVTYPE=BND;STRANDS=--:19;SECONDARY;EVENT=2;MATEID=2_1;CIPOS=-7,5;CIEND=-1,0;CIPOS95=-2,1;CIEND95=0,1;IMPRECISE;SU=19;SR=19;PRPOS=4.89496e-31,1.94872e-23,7.75799e-16,3.08851e-08,0.0308851,0.0775799,0.194872,0.489496,0.194872,0.0122956,1.94872e-09,4.89496e-17,1.22956e-24;PREND=9.99999e-13,9.99999e-07	GT:SU:SR	./.:19:19
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_two.vcf	Mon Jul 24 08:03:17 2017 -0400
@@ -0,0 +1,37 @@
+##fileformat=VCFv4.2
+##source=LUMPY
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
+##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
+##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
+##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
+##INFO=<ID=CIPOS95,Number=2,Type=Integer,Description="Confidence interval (95%) around POS for imprecise variants">
+##INFO=<ID=CIEND95,Number=2,Type=Integer,Description="Confidence interval (95%) around END for imprecise variants">
+##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
+##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
+##INFO=<ID=SECONDARY,Number=0,Type=Flag,Description="Secondary breakend in a multi-line variants">
+##INFO=<ID=SU,Number=.,Type=Integer,Description="Number of pieces of evidence supporting the variant across all samples">
+##INFO=<ID=PE,Number=.,Type=Integer,Description="Number of paired-end reads supporting the variant across all samples">
+##INFO=<ID=SR,Number=.,Type=Integer,Description="Number of split reads supporting the variant across all samples">
+##INFO=<ID=BD,Number=.,Type=Integer,Description="Amount of BED evidence supporting the variant across all samples">
+##INFO=<ID=EV,Number=.,Type=String,Description="Type of LUMPY evidence contributing to the variant call">
+##INFO=<ID=PRPOS,Number=.,Type=String,Description="LUMPY probability curve of the POS breakend">
+##INFO=<ID=PREND,Number=.,Type=String,Description="LUMPY probability curve of the END breakend">
+##ALT=<ID=DEL,Description="Deletion">
+##ALT=<ID=DUP,Description="Duplication">
+##ALT=<ID=INV,Description="Inversion">
+##ALT=<ID=DUP:TANDEM,Description="Tandem duplication">
+##ALT=<ID=INS,Description="Insertion of novel sequence">
+##ALT=<ID=CNV,Description="Copy number variable region">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">
+##FORMAT=<ID=PE,Number=1,Type=Integer,Description="Number of paired-end reads supporting the variant">
+##FORMAT=<ID=SR,Number=1,Type=Integer,Description="Number of split reads supporting the variant">
+##FORMAT=<ID=BD,Number=1,Type=Integer,Description="Amount of BED evidence supporting the variant">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sr_input_bam_a	sr_input_bam_b
+hg38_gold_U07000.1	14	1_1	N	[hg38_gold_U07000.1:1876[N	.	.	SVTYPE=BND;STRANDS=--:38;EVENT=1;MATEID=1_2;CIPOS=0,0;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=38;SR=38	GT:SU:SR	./.:19:19	./.:19:19
+hg38_gold_U07000.1	1876	1_2	N	[hg38_gold_U07000.1:14[N	.	.	SVTYPE=BND;STRANDS=--:38;SECONDARY;EVENT=1;MATEID=1_1;CIPOS=0,0;CIEND=0,0;CIPOS95=0,0;CIEND95=0,0;SU=38;SR=38	GT:SU:SR	./.:19:19	./.:19:19
+hg38_gold_U07000.1	10	2_1	N	[hg38_gold_U07000.1:1897[N	.	.	SVTYPE=BND;STRANDS=--:38;EVENT=2;MATEID=2_2;CIPOS=0,0;CIEND=-7,5;CIPOS95=0,1;CIEND95=-1,1;IMPRECISE;SU=38;SR=38	GT:SU:SR	./.:19:19	./.:19:19
+hg38_gold_U07000.1	1897	2_2	N	[hg38_gold_U07000.1:10[N	.	.	SVTYPE=BND;STRANDS=--:38;SECONDARY;EVENT=2;MATEID=2_1;CIPOS=-7,5;CIEND=0,0;CIPOS95=-1,1;CIEND95=0,1;IMPRECISE;SU=38;SR=38	GT:SU:SR	./.:19:19	./.:19:19
Binary file test-data/sr.input.bam has changed