# HG changeset patch
# User scisjnu123
# Date 1570113735 14400
# Node ID 0d10255b54347205cbea7d2ee5172e03c2354481
# Parent 2c7824a8d764a16e8e43f5904211a35e064130cf
Uploaded
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/analyze_covariates.xml
--- a/GATK/gatk/analyze_covariates.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/base_recalibrator.xml
--- a/GATK/gatk/base_recalibrator.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/combine_gvcfs.xml
--- a/GATK/gatk/combine_gvcfs.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 0
- --breakBandsAtMultiplesOf $optionals.breakBandsAtMultiplesOf
- #end if
- #end if
-]]>
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/combine_variants.xml
--- a/GATK/gatk/combine_variants.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/gatk.xml
--- a/GATK/gatk/gatk.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,179 +0,0 @@
-
-
- tool collection Version @VERSION@
-
- gatk_macros.xml
- realigner_target_creator.xml
- indel_realigner.xml
- base_recalibrator.xml
- analyze_covariates.xml
- print_reads.xml
- haplotype_caller.xml
- genotype_gvcfs.xml
- combine_gvcfs.xml
- combine_variants.xml
-
-
-
-
-
-
-
-
-
-
- &1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2
-]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- analysis_type['analysis_type_selector'] == 'RealignerTargetCreator'
-
-
- analysis_type['analysis_type_selector'] == 'IndelRealigner'
-
-
- analysis_type['analysis_type_selector'] == 'BaseRecalibrator'
-
-
- analysis_type['analysis_type_selector'] == 'AnalyzeCovariates'
-
-
- analysis_type['analysis_type_selector'] == 'PrintReads'
-
-
- analysis_type['analysis_type_selector'] == 'HaplotypeCaller'
-
-
- analysis_type['analysis_type_selector'] == 'GenotypeGVCFs'
-
-
- analysis_type['analysis_type_selector'] == 'CombineGVCFs'
-
-
- analysis_type['analysis_type_selector'] == 'CombineVariants'
-
-
-
-
-
- 10.1101/gr.107524.110
- 10.1038/ng.806
- 10.1002/0471250953.bi1110s43
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/gatk_macros.xml
--- a/GATK/gatk/gatk_macros.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
-
-
-
-
- gatk
- GATK_PATH
- GATK_SITE_OPTIONS
- package_r_for_gatk_3_4_0
-
-
-
-
-
-
-
- 3.4-0
- ${tool.name} - ${analysis_type.analysis_type_selector}
-
- 1:
- THREAD_STRING="-nct $cond_threads.nct" &&
- #end if
- #if int($cond_threads.nt) > 1:
- THREAD_STRING=$THREAD_STRING" -nt $cond_threads.nt" &&
- #end if
- #if int($cond_threads.mem) > 0:
- GATK_MEM=$cond_threads.mem &&
- #end if
- #end if
- java -Xmx\${GATK_MEM:-\${SLURM_MEM_PER_NODE:-4096}}M -jar "\$GATK_PATH/GenomeAnalysisTK.jar" \${THREAD_STRING:-}
-]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/generation/gatk.xsl
--- a/GATK/gatk/generation/gatk.xsl Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-
-
-
-
-
-
-
- tool collection Version @VERSION@
-
-
- gatk_macros.xml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-<![CDATA[
- ############################
- ## import analysis specific preprocessings by using cheetahs internal searchList
- ## if not defined, ignore
- ############################
- #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2]
- #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"]
- #include source=$analysisPreprocessing
- #end if
-
- ############################
- ## GATK tool unspecific options
- ############################
- @GATK_EXEC@
-
- --analysis_type ${analysis_type.analysis_type_selector}
- --reference_sequence ${ref_file.fields.path}
-
- --log_to_file ${output_log}
-
- #if $cond_intervals.cond_intervals_enabled
- #for $interval in $cond_intervals.intervals:
- --intervals ${interval.L}
- #end for
- #end if
-
- #if $cond_BQSR.cond_BQSR_enabled
- --BQSR $cond_BQSR.BQSR
- #end if
-
- ############################
- ## import analysis specific options by using cheetahs internal searchList
- ## if not defined throw raw python error until better idea
- ############################
- #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2]
- #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"]
- #include source=$analysisOptions
- #else
- #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"]
- #end if
-
- ############################
- ## only put ERROR or FATAL log messages into stderr
- ## but keep full log for printing into log file
- ############################
- 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2
-]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- analysis_type['analysis_type_selector'] == ''
-
-
-
-
-
-
-
-
- 10.1101/gr.107524.110
- 10.1038/ng.806
- 10.1002/0471250953.bi1110s43
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/generation/gatk.xsldb.xml
--- a/GATK/gatk/generation/gatk.xsldb.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-
-
-
- RealignerTargetCreator
- bam
- rtc
- realigner_target_creator.xml
-
-
- IndelRealigner
- bam
- ir
- indel_realigner.xml
-
-
- BaseRecalibrator
- bam
- br
- base_recalibrator.xml
-
-
- AnalyzeCovariates
- bam
- ac
- analyze_covariates.xml
-
-
- PrintReads
- bam
- pr
- print_reads.xml
-
-
- HaplotypeCaller
- bam
- hc
- haplotype_caller.xml
-
-
- GenotypeGVCFs
- gvcf
- gg
- genotype_gvcfs.xml
-
-
- CombineGVCFs
- gvcf
- cg
- combine_gvcfs.xml
-
-
- CombineVariants
- vcf
- cv
- combine_variants.xml
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/genotype_gvcfs.xml
--- a/GATK/gatk/genotype_gvcfs.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/haplotype_caller.xml
--- a/GATK/gatk/haplotype_caller.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,65 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/indel_realigner.xml
--- a/GATK/gatk/indel_realigner.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/print_reads.xml
--- a/GATK/gatk/print_reads.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,71 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 0
- --number $optionals.number
- #end if
- #if str($optionals.platform)
- --platform $optionals.platform
- #end if
- #if str($optionals.readGroup)
- --readGroup $optionals.readGroup
- #end if
- #if $optionals.sample_file
- --sample_file $optionals.sample_file
- #end if
- #if $optionals.sample_names
- #for $sample in $optionals.sample_names:
- --intervals ${sample.sample_name}
- #end for
- #end if
- $optionals.simplify
- #end if
-]]>
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/realigner_target_creator.xml
--- a/GATK/gatk/realigner_target_creator.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/tool-data/destinations.py
--- a/GATK/gatk/tool-data/destinations.py Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,62 +0,0 @@
-from galaxy.jobs import JobDestination
-import os
-import sys
-import json
-import cStringIO
-import logging
-
-log = logging.getLogger( __name__ )
-
-
-def dump(obj, nested_level=0, output=sys.stdout):
- spacing = ' '
- if type(obj) == dict:
- print >> output, '%s{' % ((nested_level) * spacing)
- for k, v in obj.items():
- if hasattr(v, '__iter__'):
- print >> output, '%s%s:' % ((nested_level + 1) * spacing, k)
- dump(v, nested_level + 1, output)
- else:
- print >> output, '%s%s: %s' % ((nested_level + 1) * spacing, k, v)
- print >> output, '%s}' % (nested_level * spacing)
- elif type(obj) == list:
- print >> output, '%s[' % ((nested_level) * spacing)
- for v in obj:
- if hasattr(v, '__iter__'):
- dump(v, nested_level + 1, output)
- else:
- print >> output, '%s%s' % ((nested_level + 1) * spacing, v)
- print >> output, '%s]' % ((nested_level) * spacing)
- else:
- print >> output, '%s%s' % (nested_level * spacing, obj)
-
-
-def dynamic_slurm_cluster_gatk(job, tool_id):
- # Allocate extra time
- inp_data = dict( [ ( da.name, da.dataset ) for da in job.input_datasets ] )
- inp_data.update( [ ( da.name, da.dataset ) for da in job.input_library_datasets ] )
- inp_data.update( [ ( da.name, json.loads(da.value) ) for da in job.parameters ] )
- out = cStringIO.StringIO()
- dump(inp_data, 1, out)
- log.debug(out.getvalue())
-
- nativeSpecs = '--nodes=1 --ntasks=1'
-
- # runner doesn't allow to specify --cpus-per-task
- # thus the mem calculation gets messy with more than 1 node
- # --> translate nt ==> nodes, nct ==> ntasks
-
- if 'cond_threads' not in inp_data:
- return JobDestination(runner="slurm")
-
- if inp_data['cond_threads']['cond_threads_enabled'] == "True":
- nNodes = int(inp_data['cond_threads']['nt'])
- nCPU = int(inp_data['cond_threads']['nct'])
- nMEM = int(inp_data['cond_threads']['mem'])
- if nMEM > 0:
- nativeSpecs = '--nodes=%d --ntasks=%d --mem=%d' % (nNodes, nCPU*nNodes, nMEM)
- else:
- nativeSpecs = '--nodes=%d --ntasks=%d' % (nNodes, nCPU*nNodes)
-
- return JobDestination(runner="slurm", params={"nativeSpecification": nativeSpecs})
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/tool-data/picard_index.loc.sample
--- a/GATK/gatk/tool-data/picard_index.loc.sample Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-#This is a sample file distributed with Galaxy that enables tools
-#to use a directory of Picard dict and associated files. You will need
-#to create these data files and then create a picard_index.loc file
-#similar to this one (store it in this directory) that points to
-#the directories in which those files are stored. The picard_index.loc
-#file has this format (longer white space is the TAB character):
-#
-#
-#
-#So, for example, if you had hg18 indexed and stored in
-#/depot/data2/galaxy/srma/hg18/,
-#then the srma_index.loc entry would look like this:
-#
-#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa
-#
-#and your /depot/data2/galaxy/srma/hg18/ directory
-#would contain the following three files:
-#hg18.fa
-#hg18.dict
-#hg18.fa.fai
-#
-#The dictionary file for each reference (ex. hg18.dict) must be
-#created via Picard (http://picard.sourceforge.net). Note that
-#the dict file does not have the .fa extension although the
-#path list in the loc file does include it.
-#
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/tool_data_table_conf.xml.sample
--- a/GATK/gatk/tool_data_table_conf.xml.sample Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-
-
-
- value, dbkey, name, path
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/gatk/tool_dependencies.xml
--- a/GATK/gatk/tool_dependencies.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-
-
-
- /mnt/galaxy/tools/GATK/3.4-0
-
-
-
-
-
-
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/package_picard_1_135/tool_dependencies.xml
--- a/GATK/package_picard_1_135/tool_dependencies.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,22 +0,0 @@
-
-
-
-
-
-
- https://github.com/broadinstitute/picard/releases/download/1.135/picard-tools-1.135.zip
-
- .
- $INSTALL_DIR
-
-
-
- $INSTALL_DIR
-
-
-
-
-This picard package dependency is retrieved directly from https://github.com/broadinstitute/picard/releases
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 GATK/package_r_for_gatk_3_4_0/tool_dependencies.xml
--- a/GATK/package_r_for_gatk_3_4_0/tool_dependencies.xml Thu Sep 12 06:50:21 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
- https://github.com/cran/stringi/archive/0.5-5.tar.gz
- https://github.com/cran/magrittr/archive/1.5.tar.gz
- https://github.com/cran/stringr/archive/1.0.0.tar.gz
- https://github.com/cran/RColorBrewer/archive/1.1-2.tar.gz
- https://github.com/cran/dichromat/archive/2.0-0.tar.gz
- https://github.com/cran/colorspace/archive/1.2-6.tar.gz
- https://github.com/cran/munsell/archive/0.4.2.tar.gz
- https://github.com/cran/labeling/archive/0.3.tar.gz
- https://github.com/cran/Rcpp/archive/0.11.6.tar.gz
- https://github.com/cran/digest/archive/0.6.8.tar.gz
- https://github.com/cran/gtable/archive/0.1.2.tar.gz
- https://github.com/cran/bitops/archive/1.0-6.tar.gz
- https://github.com/cran/caTools/archive/1.17.1.tar.gz
- https://github.com/cran/gtools/archive/3.5.0.tar.gz
- https://github.com/cran/gdata/archive/2.17.0.tar.gz
- https://github.com/cran/gsalib/archive/2.1.tar.gz
- https://github.com/cran/gplots/archive/2.17.0.tar.gz
- https://github.com/cran/plyr/archive/1.8.3.tar.gz
- https://github.com/cran/reshape/archive/0.8.5.tar.gz
- https://github.com/cran/reshape2/archive/1.4.1.tar.gz
- https://github.com/cran/scales/archive/0.2.5.tar.gz
- https://github.com/cran/proto/archive/0.3-10.tar.gz
- https://github.com/cran/MASS/archive/7.3-43.tar.gz
- https://github.com/cran/ggplot2/archive/1.0.1.tar.gz
-
-
-
-
- ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts.
- It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics.
-
- http://ggplot2.org/
-
-
-
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/Haplotypecaller.ga
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/Haplotypecaller.ga Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,101 @@
+{
+ "a_galaxy_workflow": "true",
+ "annotation": "Haplotypecaller workflow.",
+ "format-version": "0.1",
+ "name": "Haplotypecaller",
+ "steps": {
+ "0": {
+ "annotation": "",
+ "id": 0,
+ "input_connections": {},
+ "inputs": [
+ {
+ "description": "",
+ "name": "Input Dataset"
+ }
+ ],
+ "label": null,
+ "name": "Input dataset",
+ "outputs": [],
+ "position": {
+ "left": 198,
+ "top": 208.1666717529297
+ },
+ "tool_errors": null,
+ "tool_id": null,
+ "tool_state": "{\"name\": \"Input Dataset\"}",
+ "tool_version": null,
+ "type": "data_input",
+ "user_outputs": [],
+ "uuid": "b9fb671f-f07f-498c-8c85-619987501313"
+ },
+ "1": {
+ "annotation": "",
+ "id": 1,
+ "input_connections": {
+ "analysis_type|cond_bam_input|input": {
+ "id": 0,
+ "output_name": "output"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "GATK",
+ "outputs": [
+ {
+ "name": "rtc_output_intervals",
+ "type": "gatk_interval"
+ },
+ {
+ "name": "ir_output_bam",
+ "type": "bam"
+ },
+ {
+ "name": "br_table",
+ "type": "tabular"
+ },
+ {
+ "name": "ac_plotsReportFile",
+ "type": "pdf"
+ },
+ {
+ "name": "pr_output_bam",
+ "type": "bam"
+ },
+ {
+ "name": "hc_output_gvcf",
+ "type": "vcf"
+ },
+ {
+ "name": "gg_output_gvcf",
+ "type": "vcf"
+ },
+ {
+ "name": "cg_output_vcf",
+ "type": "vcf"
+ },
+ {
+ "name": "cv_output_vcf",
+ "type": "vcf"
+ },
+ {
+ "name": "output_log",
+ "type": "txt"
+ }
+ ],
+ "position": {
+ "left": 484.83331298828125,
+ "top": 200
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/avowinkel/gatk/gatk/3.4-0.d9",
+ "tool_state": "{\"__page__\": 0, \"cond_threads\": \"{\\\"cond_threads_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\", \"ref_file\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"\\\"}\", \"__rerun_remap_job_id__\": null, \"cond_BQSR\": \"{\\\"cond_BQSR_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\", \"analysis_type\": \"{\\\"analysis_type_selector\\\": \\\"HaplotypeCaller\\\", \\\"cond_usage\\\": {\\\"emitRefConfidence\\\": \\\"GVCF\\\", \\\"__current_case__\\\": 0, \\\"cond_usage_selector\\\": \\\"GVCF\\\"}, \\\"optional_parameters\\\": {\\\"optional_parameters_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}, \\\"cond_bam_input\\\": {\\\"input\\\": null, \\\"all_in_one\\\": \\\"False\\\", \\\"__current_case__\\\": 1}, \\\"__current_case__\\\": 5}\", \"cond_intervals\": \"{\\\"cond_intervals_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\"}",
+ "tool_version": "3.4-0.d9",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "5988799c-5658-4475-89ee-86f267cfbee1"
+ }
+ },
+ "uuid": "e4f99de3-d6b0-4554-9562-60b2b0eed2d3"
+}
\ No newline at end of file
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/SAMTools.ga
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/SAMTools.ga Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,160 @@
+{
+ "a_galaxy_workflow": "true",
+ "annotation": "SAMTools workflow",
+ "format-version": "0.1",
+ "name": "SAMTools",
+ "steps": {
+ "0": {
+ "annotation": "",
+ "id": 0,
+ "input_connections": {},
+ "inputs": [
+ {
+ "description": "",
+ "name": "Input Dataset"
+ }
+ ],
+ "label": null,
+ "name": "Input dataset",
+ "outputs": [],
+ "position": {
+ "left": 162,
+ "top": 169
+ },
+ "tool_errors": null,
+ "tool_id": null,
+ "tool_state": "{\"name\": \"Input Dataset\"}",
+ "tool_version": null,
+ "type": "data_input",
+ "user_outputs": [],
+ "uuid": "1eec712c-2fe0-43a8-b033-f027a74c6a48"
+ },
+ "1": {
+ "annotation": "",
+ "id": 1,
+ "input_connections": {},
+ "inputs": [
+ {
+ "description": "",
+ "name": "Input Dataset"
+ }
+ ],
+ "label": null,
+ "name": "Input dataset",
+ "outputs": [],
+ "position": {
+ "left": 273.83331298828125,
+ "top": 341.83331298828125
+ },
+ "tool_errors": null,
+ "tool_id": null,
+ "tool_state": "{\"name\": \"Input Dataset\"}",
+ "tool_version": null,
+ "type": "data_input",
+ "user_outputs": [],
+ "uuid": "98d5e3bf-4715-48a2-9b2b-29b21179c425"
+ },
+ "2": {
+ "annotation": "",
+ "id": 2,
+ "input_connections": {
+ "input1": {
+ "id": 0,
+ "output_name": "output"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "Sort",
+ "outputs": [
+ {
+ "name": "output1",
+ "type": "bam"
+ }
+ ],
+ "position": {
+ "left": 330,
+ "top": 222
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_sort/samtools_sort/2.0",
+ "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input1\": \"null\", \"sort_mode\": \"\\\"\\\"\"}",
+ "tool_version": "2.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "5e9aef3e-c2ad-43d1-b248-7d94d6a974bc"
+ },
+ "3": {
+ "annotation": "",
+ "id": 3,
+ "input_connections": {
+ "reference_source|input_bams_0|input_bam": {
+ "id": 2,
+ "output_name": "output1"
+ },
+ "reference_source|ref_file": {
+ "id": 1,
+ "output_name": "output"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "MPileup",
+ "outputs": [
+ {
+ "name": "output_mpileup",
+ "type": "pileup"
+ },
+ {
+ "name": "output_log",
+ "type": "txt"
+ }
+ ],
+ "position": {
+ "left": 484.83331298828125,
+ "top": 313.83331298828125
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_mpileup/samtools_mpileup/2.0",
+ "tool_state": "{\"__page__\": 0, \"advanced_options\": \"{\\\"advanced_options_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 1}\", \"__rerun_remap_job_id__\": null, \"genotype_likelihood_computation_type\": \"{\\\"output_mapping_quality\\\": \\\"False\\\", \\\"genotype_likelihood_computation_type_selector\\\": \\\"do_not_perform_genotype_likelihood_computation\\\", \\\"__current_case__\\\": 1, \\\"base_position_on_reads\\\": \\\"False\\\"}\", \"reference_source\": \"{\\\"ref_file\\\": null, \\\"reference_source_selector\\\": \\\"history\\\", \\\"input_bams\\\": [{\\\"__index__\\\": 0, \\\"input_bam\\\": null}], \\\"__current_case__\\\": 1}\"}",
+ "tool_version": "2.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "84c6d18e-1930-4eef-b58a-46c88a35339e"
+ },
+ "4": {
+ "annotation": "",
+ "id": 4,
+ "input_connections": {
+ "input_file": {
+ "id": 3,
+ "output_name": "output_mpileup"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "Pileup to VCF",
+ "outputs": [
+ {
+ "name": "output_file",
+ "type": "vcf"
+ }
+ ],
+ "position": {
+ "left": 562.8333129882812,
+ "top": 469.83331298828125
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/pileup_to_vcf/pileup_to_vcf/2.2",
+ "tool_state": "{\"snps_only\": \"\\\"False\\\"\", \"min_cvrg\": \"\\\"5\\\"\", \"allow_multiples\": \"\\\"False\\\"\", \"input_file\": \"null\", \"__page__\": 0, \"vcf_id\": \"\\\"\\\"\", \"__rerun_remap_job_id__\": null, \"cols\": \"{\\\"select_order\\\": \\\"no\\\", \\\"__current_case__\\\": 0}\", \"depth_as\": \"\\\"ref\\\"\", \"min_base_qual\": \"\\\"20\\\"\", \"min_var_pct\": \"\\\"0.5\\\"\"}",
+ "tool_version": "2.2",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "335581bc-838e-431d-8247-1e1580260275"
+ }
+ },
+ "uuid": "0c40a63d-b349-49b8-90b0-c927a05253ef"
+}
\ No newline at end of file
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/SVDetect.ga
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/SVDetect.ga Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,163 @@
+{
+ "a_galaxy_workflow": "true",
+ "annotation": "SVDetect workflow.",
+ "format-version": "0.1",
+ "name": "SVDetect",
+ "steps": {
+ "0": {
+ "annotation": "",
+ "id": 0,
+ "input_connections": {},
+ "inputs": [],
+ "label": null,
+ "name": "Import data",
+ "outputs": [
+ {
+ "name": "outbamfile",
+ "type": "bam"
+ },
+ {
+ "name": "outlenfile",
+ "type": "len"
+ },
+ {
+ "name": "outsvfile",
+ "type": "sv"
+ }
+ ],
+ "position": {
+ "left": 171.99998474121094,
+ "top": 168
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_import/1.0.0",
+ "tool_state": "{\"file_name\": \"\\\"file1\\\"\", \"__rerun_remap_job_id__\": null, \"type\": \"{\\\"file_type\\\": \\\"bam\\\", \\\"__current_case__\\\": 0}\", \"file_path\": \"\\\"\\\"\", \"__page__\": 0}",
+ "tool_version": "1.0.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "296478bf-9b67-47e8-9685-ced05bad2857"
+ },
+ "1": {
+ "annotation": "",
+ "id": 1,
+ "input_connections": {},
+ "inputs": [],
+ "label": null,
+ "name": "Import data",
+ "outputs": [
+ {
+ "name": "outbamfile",
+ "type": "bam"
+ },
+ {
+ "name": "outlenfile",
+ "type": "len"
+ },
+ {
+ "name": "outsvfile",
+ "type": "sv"
+ }
+ ],
+ "position": {
+ "left": 266.99998474121094,
+ "top": 430.99998474121094
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_import/1.0.0",
+ "tool_state": "{\"file_name\": \"\\\"file1\\\"\", \"__rerun_remap_job_id__\": null, \"type\": \"{\\\"file_type\\\": \\\"len\\\", \\\"__current_case__\\\": 1}\", \"file_path\": \"\\\"\\\"\", \"__page__\": 0}",
+ "tool_version": "1.0.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "481cf408-cef1-4ccd-bd08-9394bac04050"
+ },
+ "2": {
+ "annotation": "",
+ "id": 2,
+ "input_connections": {
+ "inputBam": {
+ "id": 0,
+ "output_name": "outbamfile"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "BAM preprocessing",
+ "outputs": [
+ {
+ "name": "abBAM",
+ "type": "bam"
+ },
+ {
+ "name": "log",
+ "type": "txt"
+ },
+ {
+ "name": "normBAM",
+ "type": "bam"
+ }
+ ],
+ "position": {
+ "left": 344.49998474121094,
+ "top": 224
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_preprocessing/1.0.0",
+ "tool_state": "{\"__page__\": 0, \"isizeMin\": \"\\\"0\\\"\", \"newBam\": \"{\\\"__current_case__\\\": 1, \\\"pairNormal\\\": \\\"no\\\"}\", \"isizeMax\": \"\\\"10000\\\"\", \"inputBam\": \"null\", \"__rerun_remap_job_id__\": null, \"nbrePair\": \"\\\"1000000\\\"\", \"readType\": \"\\\"1\\\"\", \"sample_name\": \"\\\"sample\\\"\", \"pairType\": \"\\\"1\\\"\", \"foldPair\": \"\\\"3.0\\\"\"}",
+ "tool_version": "1.0.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "01f2c8d8-123b-4eac-9c5e-6101e9264dbf"
+ },
+ "3": {
+ "annotation": "",
+ "id": 3,
+ "input_connections": {
+ "cmap_file": {
+ "id": 1,
+ "output_name": "outlenfile"
+ },
+ "mates_file": {
+ "id": 2,
+ "output_name": "abBAM"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "Detect clusters of anomalously mapped pairs",
+ "outputs": [
+ {
+ "name": "sv_file",
+ "type": "sv"
+ },
+ {
+ "name": "circos_file",
+ "type": "segdup"
+ },
+ {
+ "name": "bed_file",
+ "type": "bed"
+ },
+ {
+ "name": "log_file",
+ "type": "txt"
+ }
+ ],
+ "position": {
+ "left": 472.49998474121094,
+ "top": 375.99998474121094
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_run_parallel/1.0.0",
+ "tool_state": "{\"__page__\": 0, \"sv_type\": \"\\\"all\\\"\", \"getLinks\": \"{\\\"window_size\\\": \\\"3000\\\", \\\"step_length\\\": \\\"250\\\", \\\"splitmate\\\": \\\"True\\\", \\\"__current_case__\\\": 1, \\\"linking\\\": \\\"linking\\\"}\", \"mates_file\": \"null\", \"__rerun_remap_job_id__\": null, \"getFilteredLinks\": \"{\\\"nb_pairs_threshold\\\": \\\"5\\\", \\\"filtering\\\": \\\"filtering\\\", \\\"chromosomes\\\": \\\"\\\", \\\"file_conversion\\\": {\\\"file_conversion_select\\\": \\\"do_not_convert\\\", \\\"__current_case__\\\": 0}, \\\"splitlink\\\": \\\"False\\\", \\\"filter1\\\": {\\\"strand_filtering\\\": \\\"strand\\\", \\\"final_score_threshold\\\": \\\"1.0\\\", \\\"__current_case__\\\": 1, \\\"filter2\\\": {\\\"nb_pairs_order_threshold\\\": \\\"2\\\", \\\"order_filtering\\\": \\\"order\\\", \\\"__current_case__\\\": 1, \\\"filter3\\\": {\\\"indel_sigma_threshold\\\": \\\"3.0\\\", \\\"dup_sigma_threshold\\\": \\\"3.0\\\", \\\"singleton_sigma_threshold\\\": \\\"4.0\\\", \\\"__current_case__\\\": 1, \\\"insert_size_filtering\\\": \\\"insert\\\"}, \\\"mu_length\\\": \\\"3000\\\", \\\"sigma_length\\\": \\\"250\\\"}}, \\\"__current_case__\\\": 1, \\\"links2SV\\\": \\\"True\\\"}\", \"read2_length\": \"\\\"50\\\"\", \"read1_length\": \"\\\"50\\\"\", \"sample_name\": \"\\\"sample\\\"\", \"mates_orientation\": \"\\\"FR\\\"\", \"cmap_file\": \"null\"}",
+ "tool_version": "1.0.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "2edd1fca-fc2d-453e-aead-c95a2e250d0a"
+ }
+ },
+ "uuid": "bb58fd42-403a-48ad-80d2-59d81f0c8f0b"
+}
\ No newline at end of file
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/VARSCAN.ga
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/VARSCAN.ga Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,133 @@
+{
+ "a_galaxy_workflow": "true",
+ "annotation": "VARSCAN workflow.",
+ "format-version": "0.1",
+ "name": "VARSCAN",
+ "steps": {
+ "0": {
+ "annotation": "",
+ "id": 0,
+ "input_connections": {},
+ "inputs": [
+ {
+ "description": "",
+ "name": "Input Dataset"
+ }
+ ],
+ "label": null,
+ "name": "Input dataset",
+ "outputs": [],
+ "position": {
+ "left": 161,
+ "top": 187
+ },
+ "tool_errors": null,
+ "tool_id": null,
+ "tool_state": "{\"name\": \"Input Dataset\"}",
+ "tool_version": null,
+ "type": "data_input",
+ "user_outputs": [],
+ "uuid": "72a1326a-cd57-4567-8c41-cc66411642be"
+ },
+ "1": {
+ "annotation": "",
+ "id": 1,
+ "input_connections": {},
+ "inputs": [
+ {
+ "description": "",
+ "name": "Input Dataset"
+ }
+ ],
+ "label": null,
+ "name": "Input dataset",
+ "outputs": [],
+ "position": {
+ "left": 163.83331298828125,
+ "top": 263.8333282470703
+ },
+ "tool_errors": null,
+ "tool_id": null,
+ "tool_state": "{\"name\": \"Input Dataset\"}",
+ "tool_version": null,
+ "type": "data_input",
+ "user_outputs": [],
+ "uuid": "d8444634-af6c-44c5-9b55-8c3284da2823"
+ },
+ "2": {
+ "annotation": "",
+ "id": 2,
+ "input_connections": {
+ "reference_source|input_bams_0|input_bam": {
+ "id": 0,
+ "output_name": "output"
+ },
+ "reference_source|ref_file": {
+ "id": 1,
+ "output_name": "output"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "MPileup",
+ "outputs": [
+ {
+ "name": "output_mpileup",
+ "type": "pileup"
+ },
+ {
+ "name": "output_log",
+ "type": "txt"
+ }
+ ],
+ "position": {
+ "left": 370,
+ "top": 187
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_mpileup/samtools_mpileup/2.0",
+ "tool_state": "{\"__page__\": 0, \"advanced_options\": \"{\\\"advanced_options_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 1}\", \"__rerun_remap_job_id__\": null, \"genotype_likelihood_computation_type\": \"{\\\"output_mapping_quality\\\": \\\"False\\\", \\\"genotype_likelihood_computation_type_selector\\\": \\\"do_not_perform_genotype_likelihood_computation\\\", \\\"__current_case__\\\": 1, \\\"base_position_on_reads\\\": \\\"False\\\"}\", \"reference_source\": \"{\\\"ref_file\\\": null, \\\"reference_source_selector\\\": \\\"history\\\", \\\"input_bams\\\": [{\\\"__index__\\\": 0, \\\"input_bam\\\": null}], \\\"__current_case__\\\": 1}\"}",
+ "tool_version": "2.0",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "26d34295-7783-40e1-950f-a485ff569f5b"
+ },
+ "3": {
+ "annotation": "",
+ "id": 3,
+ "input_connections": {
+ "in_file": {
+ "id": 2,
+ "output_name": "output_mpileup"
+ }
+ },
+ "inputs": [],
+ "label": null,
+ "name": "VarScan mpileup",
+ "outputs": [
+ {
+ "name": "output",
+ "type": "vcf"
+ },
+ {
+ "name": "log",
+ "type": "txt"
+ }
+ ],
+ "position": {
+ "left": 563.8333129882812,
+ "top": 364.83331298828125
+ },
+ "post_job_actions": {},
+ "tool_errors": null,
+ "tool_id": "toolshed.g2.bx.psu.edu/repos/fcaramia/varscan/varscan_mpileup/2.3.5",
+ "tool_state": "{\"strand_filter\": \"\\\"1\\\"\", \"min_avg_qual\": \"\\\"15\\\"\", \"min_coverage\": \"\\\"8\\\"\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"min_freq_for_hom\": \"\\\"0.75\\\"\", \"min_var_freq\": \"\\\"0.01\\\"\", \"exe_command\": \"\\\"mpileup2snp\\\"\", \"min_reads2\": \"\\\"2\\\"\", \"p_value\": \"\\\"0.99\\\"\", \"variants\": \"\\\"1\\\"\", \"vcf_sample_list\": \"null\", \"in_file\": \"null\"}",
+ "tool_version": "2.3.5",
+ "type": "tool",
+ "user_outputs": [],
+ "uuid": "5ff6cf71-5d74-42ad-bcdc-7c75891dc10e"
+ }
+ },
+ "uuid": "5b7bf29e-1272-4211-b345-37695989c6fa"
+}
\ No newline at end of file
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/analyze_covariates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/analyze_covariates.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/base_recalibrator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/base_recalibrator.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/combine_gvcfs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/combine_gvcfs.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,44 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+ --breakBandsAtMultiplesOf $optionals.breakBandsAtMultiplesOf
+ #end if
+ #end if
+]]>
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/combine_variants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/combine_variants.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,96 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/gatk.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/gatk.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,179 @@
+
+
+ tool collection Version @VERSION@
+
+ gatk_macros.xml
+ realigner_target_creator.xml
+ indel_realigner.xml
+ base_recalibrator.xml
+ analyze_covariates.xml
+ print_reads.xml
+ haplotype_caller.xml
+ genotype_gvcfs.xml
+ combine_gvcfs.xml
+ combine_variants.xml
+
+
+
+
+
+
+
+
+
+
+ &1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ analysis_type['analysis_type_selector'] == 'RealignerTargetCreator'
+
+
+ analysis_type['analysis_type_selector'] == 'IndelRealigner'
+
+
+ analysis_type['analysis_type_selector'] == 'BaseRecalibrator'
+
+
+ analysis_type['analysis_type_selector'] == 'AnalyzeCovariates'
+
+
+ analysis_type['analysis_type_selector'] == 'PrintReads'
+
+
+ analysis_type['analysis_type_selector'] == 'HaplotypeCaller'
+
+
+ analysis_type['analysis_type_selector'] == 'GenotypeGVCFs'
+
+
+ analysis_type['analysis_type_selector'] == 'CombineGVCFs'
+
+
+ analysis_type['analysis_type_selector'] == 'CombineVariants'
+
+
+
+
+
+ 10.1101/gr.107524.110
+ 10.1038/ng.806
+ 10.1002/0471250953.bi1110s43
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/gatk_macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/gatk_macros.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,166 @@
+
+
+
+
+ gatk
+ GATK_PATH
+ GATK_SITE_OPTIONS
+ package_r_for_gatk_3_4_0
+
+
+
+
+
+
+
+ 3.4-0
+ ${tool.name} - ${analysis_type.analysis_type_selector}
+
+ 1:
+ THREAD_STRING="-nct $cond_threads.nct" &&
+ #end if
+ #if int($cond_threads.nt) > 1:
+ THREAD_STRING=$THREAD_STRING" -nt $cond_threads.nt" &&
+ #end if
+ #if int($cond_threads.mem) > 0:
+ GATK_MEM=$cond_threads.mem &&
+ #end if
+ #end if
+ java -Xmx\${GATK_MEM:-\${SLURM_MEM_PER_NODE:-4096}}M -jar "\$GATK_PATH/GenomeAnalysisTK.jar" \${THREAD_STRING:-}
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/generation/gatk.xsl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/generation/gatk.xsl Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,162 @@
+
+
+
+
+
+
+
+ tool collection Version @VERSION@
+
+
+ gatk_macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<![CDATA[
+ ############################
+ ## import analysis specific preprocessings by using cheetahs internal searchList
+ ## if not defined, ignore
+ ############################
+ #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2]
+ #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"]
+ #include source=$analysisPreprocessing
+ #end if
+
+ ############################
+ ## GATK tool unspecific options
+ ############################
+ @GATK_EXEC@
+
+ --analysis_type ${analysis_type.analysis_type_selector}
+ --reference_sequence ${ref_file.fields.path}
+
+ --log_to_file ${output_log}
+
+ #if $cond_intervals.cond_intervals_enabled
+ #for $interval in $cond_intervals.intervals:
+ --intervals ${interval.L}
+ #end for
+ #end if
+
+ #if $cond_BQSR.cond_BQSR_enabled
+ --BQSR $cond_BQSR.BQSR
+ #end if
+
+ ############################
+ ## import analysis specific options by using cheetahs internal searchList
+ ## if not defined throw raw python error until better idea
+ ############################
+ #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2]
+ #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"]
+ #include source=$analysisOptions
+ #else
+ #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"]
+ #end if
+
+ ############################
+ ## only put ERROR or FATAL log messages into stderr
+ ## but keep full log for printing into log file
+ ############################
+ 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ analysis_type['analysis_type_selector'] == ''
+
+
+
+
+
+
+
+
+ 10.1101/gr.107524.110
+ 10.1038/ng.806
+ 10.1002/0471250953.bi1110s43
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/generation/gatk.xsldb.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/generation/gatk.xsldb.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,57 @@
+
+
+
+ RealignerTargetCreator
+ bam
+ rtc
+ realigner_target_creator.xml
+
+
+ IndelRealigner
+ bam
+ ir
+ indel_realigner.xml
+
+
+ BaseRecalibrator
+ bam
+ br
+ base_recalibrator.xml
+
+
+ AnalyzeCovariates
+ bam
+ ac
+ analyze_covariates.xml
+
+
+ PrintReads
+ bam
+ pr
+ print_reads.xml
+
+
+ HaplotypeCaller
+ bam
+ hc
+ haplotype_caller.xml
+
+
+ GenotypeGVCFs
+ gvcf
+ gg
+ genotype_gvcfs.xml
+
+
+ CombineGVCFs
+ gvcf
+ cg
+ combine_gvcfs.xml
+
+
+ CombineVariants
+ vcf
+ cv
+ combine_variants.xml
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/genotype_gvcfs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/genotype_gvcfs.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/haplotype_caller.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/haplotype_caller.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,65 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/indel_realigner.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/indel_realigner.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,90 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/print_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/print_reads.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,71 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 0
+ --number $optionals.number
+ #end if
+ #if str($optionals.platform)
+ --platform $optionals.platform
+ #end if
+ #if str($optionals.readGroup)
+ --readGroup $optionals.readGroup
+ #end if
+ #if $optionals.sample_file
+ --sample_file $optionals.sample_file
+ #end if
+ #if $optionals.sample_names
+ #for $sample in $optionals.sample_names:
+ --intervals ${sample.sample_name}
+ #end for
+ #end if
+ $optionals.simplify
+ #end if
+]]>
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/realigner_target_creator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/realigner_target_creator.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,57 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/tool-data/destinations.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/tool-data/destinations.py Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,62 @@
+from galaxy.jobs import JobDestination
+import os
+import sys
+import json
+import cStringIO
+import logging
+
+log = logging.getLogger( __name__ )
+
+
+def dump(obj, nested_level=0, output=sys.stdout):
+ spacing = ' '
+ if type(obj) == dict:
+ print >> output, '%s{' % ((nested_level) * spacing)
+ for k, v in obj.items():
+ if hasattr(v, '__iter__'):
+ print >> output, '%s%s:' % ((nested_level + 1) * spacing, k)
+ dump(v, nested_level + 1, output)
+ else:
+ print >> output, '%s%s: %s' % ((nested_level + 1) * spacing, k, v)
+ print >> output, '%s}' % (nested_level * spacing)
+ elif type(obj) == list:
+ print >> output, '%s[' % ((nested_level) * spacing)
+ for v in obj:
+ if hasattr(v, '__iter__'):
+ dump(v, nested_level + 1, output)
+ else:
+ print >> output, '%s%s' % ((nested_level + 1) * spacing, v)
+ print >> output, '%s]' % ((nested_level) * spacing)
+ else:
+ print >> output, '%s%s' % (nested_level * spacing, obj)
+
+
+def dynamic_slurm_cluster_gatk(job, tool_id):
+ # Allocate extra time
+ inp_data = dict( [ ( da.name, da.dataset ) for da in job.input_datasets ] )
+ inp_data.update( [ ( da.name, da.dataset ) for da in job.input_library_datasets ] )
+ inp_data.update( [ ( da.name, json.loads(da.value) ) for da in job.parameters ] )
+ out = cStringIO.StringIO()
+ dump(inp_data, 1, out)
+ log.debug(out.getvalue())
+
+ nativeSpecs = '--nodes=1 --ntasks=1'
+
+ # runner doesn't allow to specify --cpus-per-task
+ # thus the mem calculation gets messy with more than 1 node
+ # --> translate nt ==> nodes, nct ==> ntasks
+
+ if 'cond_threads' not in inp_data:
+ return JobDestination(runner="slurm")
+
+ if inp_data['cond_threads']['cond_threads_enabled'] == "True":
+ nNodes = int(inp_data['cond_threads']['nt'])
+ nCPU = int(inp_data['cond_threads']['nct'])
+ nMEM = int(inp_data['cond_threads']['mem'])
+ if nMEM > 0:
+ nativeSpecs = '--nodes=%d --ntasks=%d --mem=%d' % (nNodes, nCPU*nNodes, nMEM)
+ else:
+ nativeSpecs = '--nodes=%d --ntasks=%d' % (nNodes, nCPU*nNodes)
+
+ return JobDestination(runner="slurm", params={"nativeSpecification": nativeSpecs})
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/tool-data/picard_index.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/tool-data/picard_index.loc.sample Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of Picard dict and associated files. You will need
+#to create these data files and then create a picard_index.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The picard_index.loc
+#file has this format (longer white space is the TAB character):
+#
+#
+#
+#So, for example, if you had hg18 indexed and stored in
+#/depot/data2/galaxy/srma/hg18/,
+#then the srma_index.loc entry would look like this:
+#
+#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa
+#
+#and your /depot/data2/galaxy/srma/hg18/ directory
+#would contain the following three files:
+#hg18.fa
+#hg18.dict
+#hg18.fa.fai
+#
+#The dictionary file for each reference (ex. hg18.dict) must be
+#created via Picard (http://picard.sourceforge.net). Note that
+#the dict file does not have the .fa extension although the
+#path list in the loc file does include it.
+#
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/tool_data_table_conf.xml.sample Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,7 @@
+
+
+
+ value, dbkey, name, path
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/gatk/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/gatk/tool_dependencies.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,19 @@
+
+
+
+ /mnt/galaxy/tools/GATK/3.4-0
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/package_r_for_gatk_3_4_0/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/package_r_for_gatk_3_4_0/tool_dependencies.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,48 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+ https://github.com/cran/stringi/archive/0.5-5.tar.gz
+ https://github.com/cran/magrittr/archive/1.5.tar.gz
+ https://github.com/cran/stringr/archive/1.0.0.tar.gz
+ https://github.com/cran/RColorBrewer/archive/1.1-2.tar.gz
+ https://github.com/cran/dichromat/archive/2.0-0.tar.gz
+ https://github.com/cran/colorspace/archive/1.2-6.tar.gz
+ https://github.com/cran/munsell/archive/0.4.2.tar.gz
+ https://github.com/cran/labeling/archive/0.3.tar.gz
+ https://github.com/cran/Rcpp/archive/0.11.6.tar.gz
+ https://github.com/cran/digest/archive/0.6.8.tar.gz
+ https://github.com/cran/gtable/archive/0.1.2.tar.gz
+ https://github.com/cran/bitops/archive/1.0-6.tar.gz
+ https://github.com/cran/caTools/archive/1.17.1.tar.gz
+ https://github.com/cran/gtools/archive/3.5.0.tar.gz
+ https://github.com/cran/gdata/archive/2.17.0.tar.gz
+ https://github.com/cran/gsalib/archive/2.1.tar.gz
+ https://github.com/cran/gplots/archive/2.17.0.tar.gz
+ https://github.com/cran/plyr/archive/1.8.3.tar.gz
+ https://github.com/cran/reshape/archive/0.8.5.tar.gz
+ https://github.com/cran/reshape2/archive/1.4.1.tar.gz
+ https://github.com/cran/scales/archive/0.2.5.tar.gz
+ https://github.com/cran/proto/archive/0.3-10.tar.gz
+ https://github.com/cran/MASS/archive/7.3-43.tar.gz
+ https://github.com/cran/ggplot2/archive/1.0.1.tar.gz
+
+
+
+
+ ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts.
+ It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics.
+
+ http://ggplot2.org/
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/suite_samtools_1_2/repository_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/suite_samtools_1_2/repository_dependencies.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/BAM_preprocessingPairs.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/BAM_preprocessingPairs.pl Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,340 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+use Getopt::Std;
+my $version = '0.5_galaxy';
+
+my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools";
+
+my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." );
+
+getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY
+
+my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory";
+
+my $pt_bad_mates_file=$opts{b}; #GALAXY
+my $pt_log_file=$opts{l}; #GALAXY
+my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY
+
+
+die(qq/
+
+Description:
+
+ Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input
+ for SVDetect.
+
+ From all pairs mapped onto the reference genome, this script outputs abnormal pairs:
+ - mapped on two different chromosomes
+ - with an incorrect strand orientation and\/or pair order
+ - with an insert size distance +- sigma threshold
+ into a file
+
+ -BAM\/SAM File input format only, sorted by read names
+
+ Version : $version
+ SAMtools required for BAM files
+
+
+Usage: BAM_preprocessingPairs.pl [options]
+
+Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}]
+ -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}]
+ -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}]
+ -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}]
+ -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}]
+ -f REAL minimal number of sigma fold for filtering pairs [$opts{f}]
+ -d dump normal pairs into a file [] (optional)
+ -o STRING output directory [$working_dir]
+
+\n/) if (@ARGV == 0 && -t STDIN);
+
+unless (-d $opts{o}){
+ mkdir $opts{o} or die;
+}
+$opts{o}.="/" if($opts{o}!~/\/$/);
+
+my $mates_file=shift(@ARGV);
+
+$mates_file=readlink($mates_file);
+
+my $bad_mates_file=(split(/\//,$mates_file))[$#_];
+
+if($bad_mates_file=~/.(s|b)am$/){
+ $bad_mates_file=~s/.(b|s)am$/.ab.sam/;
+ $bad_mates_file=$opts{o}.$bad_mates_file;
+}
+
+else{
+ die "Error: mate_file with the extension <.bam> or <.sam> needed !\n";
+}
+
+my $good_mates_file;
+if($opts{d}){
+ $good_mates_file=(split(/\//,$mates_file))[$#_];
+ $good_mates_file=~s/.(b|s)am$/.norm.sam/;
+ $good_mates_file=$opts{o}.$good_mates_file;
+}
+
+my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY
+
+#------------------------------------------------------------------------------#
+#Calculate mu and sigma
+
+open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";
+
+print LOG "\# Calculating mu and sigma lengths...\n";
+print LOG "-- file=$mates_file\n";
+print LOG "-- n=$opts{n}\n";
+print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n";
+
+my ($record, $sumX,$sumX2) = (0,0,0);
+my $warn=$opts{n}/10;
+my $prev_pair="FIRST";
+
+my $bam=($mates_file =~ /.bam$/)? 1:0;
+
+if($bam){
+ open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
+}else{
+ open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
+}
+
+while(){
+
+ my @t=split;
+
+ next if ($t[0]=~/^@/);
+
+ my $current_pair=$t[0];
+ next if($current_pair eq $prev_pair);
+ $prev_pair=$current_pair;
+
+ my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8]));
+
+ next if (($t[1]&0x0004) || ($t[1]&0x0008));
+ next if ($length<$opts{s} || $length>$opts{S}) ;
+
+ if($chr2 eq "="){
+
+ $sumX += $length; #add to sum and sum^2 for mean and variance calculation
+ $sumX2 += $length*$length;
+ $record++;
+ }
+
+ if($record>$warn){
+ print LOG "-- $warn pairs analysed\n";
+ $warn+=$warn;
+ }
+
+ last if ($record>$opts{n});
+
+}
+close (MATES);
+
+$record--;
+my $mu = $sumX/$record;
+my $sigma = sqrt($sumX2/$record - $mu*$mu);
+
+print LOG "-- Total : $record pairs analysed\n";
+print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n";
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Preprocessing pairs
+
+$warn=100000;
+
+$record=0;
+my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0);
+
+my $read_type=($opts{t})? "Illumina":"SOLiD";
+my $pair_type=($opts{p})? "paired-end":"mate-paired";
+
+print LOG "\# Preprocessing pairs...\n";
+print LOG "-- file= $mates_file\n";
+print LOG "-- type= $read_type $pair_type reads\n";
+print LOG "-- sigma threshold= $opts{f}\n";
+print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n";
+
+my @header;
+
+if($bam){
+ open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
+ @header=;
+ close HEADER;
+ open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
+}else{
+ open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
+}
+
+open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n";
+print AB @header if($bam);
+
+if($opts{d}){
+ open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n";
+ print NORM @header if($bam);
+}
+
+$prev_pair="FIRST";
+my $prev_bad;
+
+while(){
+
+ my @t=split;
+ my $bad=0;
+
+ if ($t[0]=~/^@/){
+ print AB;
+ print NORM if ($opts{d});
+ next;
+ }
+
+ my $current_pair=$t[0];
+ if($current_pair eq $prev_pair){
+ next if($prev_bad==-1);
+ if($prev_bad){
+ print AB;
+ }elsif(!$prev_bad){
+ print NORM if($opts{d});
+ }
+ next;
+ }
+
+ $prev_pair=$current_pair;
+
+ my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8]));
+
+ if (($t[1]&0x0004) || ($t[1]&0x0008)){
+ $prev_bad=-1;
+ $count{unmap}++;
+ $record++;
+ next;
+
+ }
+
+ my $strand1 = (($t[1]&0x0010))? 'R':'F';
+ my $strand2 = (($t[1]&0x0020))? 'R':'F';
+ my $order1 = (($t[1]&0x0040))? '1':'2';
+ my $order2 = (($t[1]&0x0080))? '1':'2';
+
+ if($order1 == 2){
+ ($strand1,$strand2)=($strand2,$strand1);
+ ($chr1,$chr2)=($chr2,$chr1);
+ ($pos1,$pos2)=($pos2,$pos1);
+ ($order1,$order2)=($order2,$order1);
+ }
+
+ my $sense=$strand1.$strand2;
+
+ if($chr1 ne "=" && $chr2 ne "="){
+ $bad=1;
+ $count{chr}++;
+ }
+
+ if($opts{p}){ #paired-end
+ if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){
+ $bad=1;
+ $count{sense}++;
+ }
+ }else{ #mate-pair
+ if($opts{t}){ #Illumina
+ if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){
+ $bad=1;
+ $count{sense}++;
+ }
+ }else{ #SOLiD
+ if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){
+ $bad=1;
+ $count{sense}++;
+ }
+ }
+ }
+
+ if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){
+ $bad=1;
+ $count{dist}++;
+ }
+
+ if($bad){
+ print AB;
+ $count{ab}++;
+ $prev_bad=$bad;
+ }else{
+ print NORM if ($opts{d});
+ $count{norm}++;
+ $prev_bad=$bad;
+ }
+
+ $record++;
+
+ if($record>$warn){
+ print LOG "-- $warn pairs analysed\n";
+ $warn+=100000;
+ }
+}
+
+close AB;
+close NORM if($opts{d});
+
+print LOG "-- Total : $record pairs analysed\n";
+print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n";
+print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n";
+print LOG "---- $count{ab} abnormal mapped pairs\n";
+print LOG "------ $count{chr} pairs mapped on two different chromosomes\n";
+print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n";
+print LOG "------ $count{dist} pairs with incorrect insert size distance\n";
+print LOG "--- $count{norm} correct mapped pairs\n";
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#OUTPUT
+
+if($bam){
+
+ my $bam_file=$bad_mates_file;
+ $bam_file=~s/.sam$/.bam/;
+ print LOG "\# Converting sam to bam for abnormal mapped pairs\n";
+ system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log");
+ unlink($bad_mates_file);
+ print LOG "-- output created: $bam_file\n";
+
+ system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY
+
+ if($opts{d}){
+ $bam_file=$good_mates_file;
+ $bam_file=~s/.sam$/.bam/;
+ print LOG "\# Converting sam to bam for correct mapped pairs\n";
+ system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log");
+ unlink($good_mates_file);
+ print LOG "-- output created: $bam_file\n";
+
+ system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY
+
+ }
+
+}
+
+else{
+ print LOG "-- output created: $bad_mates_file\n";
+ print LOG "-- output created: $good_mates_file\n" if($opts{d});
+}
+
+close LOG;
+
+system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
+
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub decimal{
+
+ my $num=shift;
+ my $digs_to_cut=shift;
+
+ $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
+
+ return $num;
+}
+#------------------------------------------------------------------------------#
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/BAM_preprocessingPairs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/BAM_preprocessingPairs.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,77 @@
+
+
+ to get abnormal pairs
+
+ BAM_preprocessingPairs.pl -t '$readType' -p '$pairType' -n '$nbrePair' -s '$isizeMin' -S '$isizeMax' -f '$foldPair' -o $__new_file_path__/svdetect -b '$abBAM' -l '$log' -N $sample_name
+ #if $newBam.pairNormal=="yes"
+ -d -x '$normBAM'
+ #end if
+ '$inputBam'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ newBam['pairNormal'] == 'yes'
+
+
+
+
+
+**What it does**
+
+Bam_preprocessingPairs - Version 0.5
+
+Preprocessing of mates to get anomalously mapped mate-pair/paired-end reads as input for SVDetect.
+
+From all pairs mapped onto the reference genome, this script outputs abnormal pairs:
+
+ * mapped on two different chromosomes
+ * with an incorrect strand orientation and/or pair order
+ * with an insert size distance +- sigma threshold
+
+into a file prefix.ab.bam/sam sorted by read names
+
+-BAM/SAM File input format only.
+
+SAMtools required for BAM files
+
+-----
+
+.. class:: infomark
+
+Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect.
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/SVDetect_compare.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/SVDetect_compare.pl Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,716 @@
+#!/usr/bin/perl -w
+
+=pod
+
+=head1 NAME
+
+SVDetect Compare for Galaxy
+
+Version: 0.8b for Galaxy
+
+=head1 SYNOPSIS
+
+SVDetect_compare.pl links2compare -conf [-help] [-man]
+
+=cut
+
+# -------------------------------------------------------------------
+
+use strict;
+use warnings;
+
+use Pod::Usage;
+use Getopt::Long;
+
+use Config::General;
+use Tie::IxHash;
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#PARSE THE COMMAND LINE
+my %OPT;
+GetOptions(\%OPT,
+ 'conf=s',
+ 'out1=s', #GALAXY
+ 'out2=s', #GALAXY
+ 'out3=s', #GALAXY
+ 'out4=s', #GALAXY
+ 'out5=s', #GALAXY
+ 'out6=s', #GALAXY
+ 'out7=s', #GALAXY
+ 'out8=s', #GALAXY
+ 'out9=s', #GALAXY
+ 'l=s', #GALAXY
+ 'N=s', #GALAXY
+ 'help',
+ 'man'
+ );
+
+pod2usage() if $OPT{help};
+pod2usage(-verbose=>2) if $OPT{man};
+pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf});
+
+
+pod2usage() if(@ARGV<1);
+
+tie (my %func, 'Tie::IxHash',links2compare=>\&links2compare);
+
+foreach my $command (@ARGV){
+ pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command}));
+}
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#READ THE CONFIGURATION FILE
+my $conf=Config::General->new( -ConfigFile => $OPT{conf},
+ -Tie => "Tie::IxHash",
+ -AllowMultiOptions => 1,
+ -LowerCaseNames => 1,
+ -AutoTrue => 1);
+my %CONF= $conf->getall;
+validateconfiguration(\%CONF); #validation of the configuration parameters
+
+
+my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY
+my $BEDTOOLS_BIN_DIR="/bioinfo/local/BEDTools/bin"; #GALAXY
+
+my $pt_log_file=$OPT{l}; #GALAXY
+my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_compare.log"; #GALAXY
+open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY
+
+my @pt_sv_file=($OPT{out1},$OPT{out2},$OPT{out3}) if($OPT{out1}); #GALAXY common,sample,reference
+my @pt_circos_file=($OPT{out4},$OPT{out5},$OPT{out6}) if($OPT{out4}); #GALAXY common,sample,reference
+my @pt_bed_file=($OPT{out7},$OPT{out8},$OPT{out9}) if($OPT{out7}); #GALAXY common,sample,reference
+
+$CONF{compare}{sample_link_file}=readlink($CONF{compare}{sample_link_file});#GALAXY
+$CONF{compare}{sample_link_file}=~s/.sv.txt//; #GALAXY
+
+$CONF{compare}{reference_link_file}=readlink($CONF{compare}{reference_link_file});#GALAXY
+$CONF{compare}{reference_link_file}=~s/.sv.txt//; #GALAXY
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#COMMAND EXECUTION
+foreach my $command (@ARGV){
+ &{$func{$command}}();
+}
+print LOG "-- end\n";
+
+close LOG;#GALAXY
+system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
+
+exit(0);
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#FUNCTIONS
+
+# -----------------------------------------------------------------------------#
+#MAIN FUNCTION number 5:Comparison between samples, common or specific links
+sub links2compare{
+
+ my @compare_files;
+
+ compareSamples($CONF{general}{output_dir},
+ $CONF{compare}{list_samples},
+ $CONF{compare}{sample_link_file},
+ $CONF{compare}{reference_link_file},
+ $CONF{compare}{min_overlap},
+ $CONF{compare}{same_sv_type},
+ \@compare_files);
+
+ my $pt_ind=0;
+
+ for my $input_file (@compare_files){
+
+ $input_file=$CONF{general}{output_dir}.$input_file;
+
+ my $output_file=$input_file;
+ $output_file=~s/unique$/compared/;
+
+ sortLinks($input_file, $output_file,1);
+
+ if($CONF{compare}{circos_output}){
+ links2segdup($CONF{circos}{organism_id},
+ $CONF{circos}{colorcode},
+ $output_file,
+ $output_file.".segdup.txt");
+ system "rm $pt_circos_file[$pt_ind]; ln -s $output_file.segdup.txt $pt_circos_file[$pt_ind]" if (defined $pt_circos_file[$pt_ind]); #GALAXY
+ }
+
+ if($CONF{compare}{bed_output}){
+ links2bedfile($CONF{compare}{read_lengths},
+ $CONF{bed}{colorcode},
+ $output_file,
+ $output_file.".bed");
+ system "rm $pt_bed_file[$pt_ind]; ln -s $output_file.bed $pt_bed_file[$pt_ind]" if (defined $pt_bed_file[$pt_ind]); #GALAXY
+ }
+
+ if($CONF{compare}{sv_output}){
+
+ links2SVfile ($output_file, $output_file.".sv.txt");
+ system "rm $pt_sv_file[$pt_ind]; ln -s $output_file.sv.txt $pt_sv_file[$pt_ind]" if (defined $pt_sv_file[$pt_ind]); #GALAXY
+ }
+ $pt_ind++;
+
+ }
+ unlink(@compare_files);
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub compareSamples{
+
+ my ($dir,$list_samples,$sample_file,$reference_file,$min_overlap,$same_sv_type,$file_names)=@_;
+
+ my @bedpefiles;
+ my @list=split(",",$list_samples);
+ my @list_files=($sample_file,$reference_file);
+
+ print LOG "\# Comparison procedure...\n";
+ print LOG "-- samples=$list_samples\n".
+ "-- minimum overlap=$min_overlap\n".
+ "-- same SV type=$same_sv_type\n";
+
+ #conversion of links to bedPE format file
+ print LOG "-- Conversion of links.filtered files to bedPE format\n";
+ for my $s (0..$#list) {
+
+ links2bedPElinksfile($list[$s],$list_files[$s],$list_files[$s].".bedpe.txt");
+ push(@bedpefiles,$list_files[$s].".bedpe.txt");
+
+ }
+
+ #get common links between all samples compared
+ print LOG "-- Getting common links between all samples with BEDTools\n";
+ my $common_name=join(".",@list);
+
+ my $nb=scalar @list;
+ my $command="";
+ my $prompt=">";
+
+ while ($nb>0){
+
+ for my $i (0..$#list_files){
+
+ $command.="$BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a ".$list_files[$i].".bedpe.txt";
+ my $pipe=0;
+
+ for my $j ($i+1..$#list_files){
+
+ $command.="| $BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a stdin" if($pipe);
+ $command.=" -b ".$list_files[$j].".bedpe.txt";
+ $pipe=1;
+
+ }
+
+ $command.=$prompt.$dir.$common_name.".bedpe.tmp;";
+ $prompt=">>";
+
+ my $first=shift(@list_files); push(@list_files,$first);
+ last;
+ }
+ $nb--;
+ }
+
+ system ($command);
+
+ push(@bedpefiles,$dir.$common_name.".bedpe.tmp");
+
+ #Post comparison to get common links if same type only (as an option)
+ open( FILE, "<".$dir.$common_name.".bedpe.tmp") or die "Can't open".$dir.$common_name.".bedpe.tmp : $!";
+ open( OUT, ">".$dir.$common_name.".bedpe.unique") or die "Can't write in ".$dir.$common_name.".bedpe.unique : $!";
+
+ while(){
+ my @t=split("\t",$_);
+ my $s=(split("_",$t[6]))[0];
+ my ($sv1,$sv2)=($t[7],$t[18]);
+ splice(@t,11,$#t);
+
+ if($same_sv_type){
+ print OUT join("\t",@t)."\n" if($sv1 eq $sv2);
+ }else{
+ print OUT join("\t",@t)."\n";
+ }
+ }
+ close FILE;
+ close OUT;
+
+ bedPElinks2linksfile($dir.$common_name.".bedpe.unique", $dir.$common_name.".unique");
+ push(@bedpefiles,$dir.$common_name.".bedpe.unique");
+ push(@$file_names,$common_name.".unique");
+ print LOG "-- output created: ".$dir.$common_name.".compared\n";
+
+ #get specific links for each sample
+ print LOG "-- Getting specific links for each sample\n";
+ for my $s (0..$#list) {
+ system("grep -Fxv -f ".$dir.$common_name.".bedpe.unique ".$list_files[$s].".bedpe.txt >".$dir.$list[$s].".bedpe.unique");
+ bedPElinks2linksfile($dir.$list[$s].".bedpe.unique",$dir.$list[$s].".unique");
+ push(@bedpefiles,$dir.$list[$s].".bedpe.unique");
+ push(@$file_names,$list[$s].".unique");
+ print LOG "-- output created: ".$dir.$list[$s].".compared\n";
+ }
+
+ unlink(@bedpefiles);
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#convert the links file to the circos format
+sub links2segdup{
+
+ my($id,$color_code,$links_file,$segdup_file)=@_;
+
+ print LOG "\# Converting to the circos format...\n";
+
+ tie (my %hcolor,'Tie::IxHash'); #color-code hash table
+ foreach my $col (keys %{$color_code}){
+ my ($min_links,$max_links)=split(",",$color_code->{$col});
+ $hcolor{$col}=[$min_links,$max_links];
+ }
+
+ open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
+ open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
+
+ my $index=1;
+ while(){
+
+ my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6];
+
+ my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links
+
+ print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output
+ "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n";
+ $index++;
+ }
+
+ close LINKS;
+ close SEGDUP;
+ print LOG "-- output created: $segdup_file\n";
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#convert the links file to the bedPE format for BEDTools usage
+sub links2bedPElinksfile{
+
+ my ($sample,$links_file,$bedpe_file)=@_;
+
+ open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
+ open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n";
+
+ my $nb_links=1;
+
+ while(){
+
+ chomp;
+ my @t=split("\t",$_);
+ my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6);
+ my $type=($chr1 eq $chr2)? "INTRA":"INTER";
+ $type.="_".$t[10];
+
+ $start1--; $start2--;
+
+ print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2".
+ "\t$sample"."_link$nb_links\t$type\t.\t.".
+ "\t".join("|",@t)."\n";
+
+ $nb_links++;
+ }
+
+ close LINKS;
+ close BEDPE;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub bedPElinks2linksfile{
+
+ my ($bedpe_file,$links_file)=@_;
+
+ open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n";
+ open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n";
+
+ while(){
+
+ chomp;
+ my $sample=(split("_",(split("\t",$_))[6]))[0];
+ my @t1=(split("\t",$_))[0,1,2,3,4,5];
+ my @t2=split(/\|/,(split("\t",$_))[10]);
+ push(@t2,$sample);
+
+ print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n";
+
+ }
+ close BEDPE;
+ close LINKS;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#convert the links file to the bed format
+sub links2bedfile{
+
+ my ($tag_length,$color_code,$links_file,$bed_file)=@_;
+
+ print LOG "\# Converting to the bed format...\n";
+
+ my $compare=1;
+ if($links_file!~/compared$/){
+ $compare=0;
+ $tag_length->{none}->{1}=$tag_length->{1};
+ $tag_length->{none}->{2}=$tag_length->{2};
+ }
+
+ #color-code hash table
+ tie (my %hcolor,'Tie::IxHash');
+ my %color_order;
+ $color_order{"255,255,255"}=0;
+ my $n=1;
+ foreach my $col (keys %{$color_code}){
+ my ($min_links,$max_links)=split(",",$color_code->{$col});
+ $hcolor{$col}=[$min_links,$max_links];
+ $color_order{$col}=$n;
+ $n++;
+ }
+
+ my %pair;
+ my %pt;
+ $n=1;
+ open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
+
+ my %str=( "F"=>"+", "R"=>"-" );
+
+ while(){
+
+ my @t=split;
+ my $sample=($compare)? pop(@t):"none";
+
+ my $chr1=$t[0];
+ my $chr2=$t[3];
+ $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
+ $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
+ my $same_chr=($chr1 eq $chr2)? 1:0;
+
+ my $count=$t[6];
+ my $color=getColor($count,\%hcolor,"bed");
+
+ my @pairs=deleteBadOrderSensePairs(split(",",$t[7]));
+ my @strand1=deleteBadOrderSensePairs(split(",",$t[8]));
+ my @strand2=deleteBadOrderSensePairs(split(",",$t[9]));
+ my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10]));
+ my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11]));
+ my @position1=deleteBadOrderSensePairs(split(",",$t[14]));
+ my @position2=deleteBadOrderSensePairs(split(",",$t[15]));
+ my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample});
+ my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample});
+
+
+ for my $p (0..$#pairs){
+
+ if (!exists $pair{$pairs[$p]}){
+
+ if($same_chr){
+
+ $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]},
+ $start1[$p]-1, $end2[$p], $color,
+ 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ];
+ $pt{$n}=$pair{$pairs[$p]}->{0};
+ $n++;
+
+ }else{
+
+ $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]},
+ $start1[$p]-1, $end1[$p], $color,
+ 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0];
+ $pt{$n}=$pair{$pairs[$p]}->{1};
+ $n++;
+
+
+ $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]},
+ $start2[$p]-1, $end2[$p], $color,
+ 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0];
+ $pt{$n}=$pair{$pairs[$p]}->{2};
+ $n++;
+ }
+ }else{
+
+ if($same_chr){
+ ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]});
+ }else{
+ ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]});
+ ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]});
+ }
+ }
+ }
+ }
+ close LINKS;
+
+ my $nb_pairs=$n-1;
+
+ open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
+ print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ".
+ "visibility=2 itemRgb=\"On\"\n";
+
+ for my $i (1..$nb_pairs){
+ print BED join("\t",@{$pt{$i}})."\n";
+ }
+
+ close BED;
+
+ print LOG "-- output created: $bed_file\n";
+
+ undef %pair;
+ undef %pt;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub links2SVfile{
+
+ my($links_file,$sv_file)=@_;
+
+ print LOG "\# Converting to the sv output table...\n";
+ open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
+ open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
+
+ my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist
+ chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering
+ final_score breakpoint1_start1-end1 breakpoint2_start2-end2);
+
+ my $nb_links=0;
+
+ while (){
+
+ my @t=split;
+ my @sv=();
+ my $sv_type="-";
+ my $strand_ratio="-";
+ my $eq_ratio="-";
+ my $eq_type="-";
+ my $insert_ratio="-";
+ my $link="-";
+ my ($bk1, $bk2)=("-","-");
+ my $score="-";
+
+ my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]);
+ my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]);
+ my $nb_pairs=$t[6];
+ $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
+ $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
+ my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER";
+
+ #if strand filtering
+ if (defined $t[16]){
+ #if inter-chr link
+ $sv_type=$t[16];
+ if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){
+ $strand_ratio=floor($1/$2*100)."%";
+ $score=$t[18];
+ }
+ if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){
+ #if intra-chr link with insert size filtering
+ $strand_ratio=floor($1/$2*100)."%";
+ $link=floor($t[17]);
+ if($sv_type!~/^INV/){
+ $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
+ $score=$t[20];
+ }else{
+ $score=$t[19];
+ }
+ }
+ }
+
+ if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){
+
+ #if strand and order filtering only and/or interchr link
+ $eq_type=$t[18];
+ $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
+ ($bk1, $bk2)=($t[20],$t[21]);
+ foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
+ $score=$t[22];
+
+ }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){
+
+ #if all three filtering
+ $link=floor($t[17]);
+ $eq_type=$t[19];
+ $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/);
+
+ if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){
+ $insert_ratio=floor($1/$2*100)."%";
+ ($bk1, $bk2)=($t[22],$t[23]);
+ $score=$t[24];
+
+ }else{
+ ($bk1, $bk2)=($t[21],$t[22]);
+ $score=$t[23];
+ }
+ foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
+
+ }
+
+
+ push(@sv, $chr_type, $sv_type,$eq_type);
+ push(@sv,"$chr1\t$start1-$end1");
+ push(@sv, $link);
+ push(@sv,"$chr2\t$start2-$end2",
+ $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2);
+
+
+ print SV join("\t",@sv)."\n";
+ }
+
+ close LINKS;
+ close SV;
+
+ system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted";
+
+ open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n";
+ my @links=;
+ close SV;
+
+ open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
+
+ print SV join("\t",@header)."\n";
+ print SV @links;
+ close SV;
+
+ unlink($sv_file.".sorted");
+
+ print LOG "-- output created: $sv_file\n";
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub deleteBadOrderSensePairs{
+
+ my (@tab)=@_;
+ my @tab2;
+
+ foreach my $v (@tab){
+
+ $v=~s/[\(\)]//g;
+ push(@tab2,$v) if($v!~/[\$\*\@]$/);
+ }
+ return @tab2;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#gets starts and ends Coords when start=leftmost given positions, directions and orders
+sub getCoordswithLeftMost {
+
+ my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_;
+
+ for my $i (0..scalar(@{$positions})-1) {
+
+ if($strand->[$i] eq 'F'){
+ $starts->[$i]=$positions->[$i];
+ $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1;
+ }else{
+ $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1;
+ $ends->[$i]=$positions->[$i];
+ }
+ }
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub floor {
+ my $nb = $_[0];
+ $nb=~ s/\..*//;
+ return $nb;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub decimal{
+
+ my $num=shift;
+ my $digs_to_cut=shift;
+
+ $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
+
+ return $num;
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Sort links according the concerned chromosomes and their coordinates
+sub sortLinks{
+
+ my ($links_file,$sortedlinks_file,$unique)=@_;
+
+ print LOG "# Sorting links...\n";
+
+ my $pipe=($unique)? "| sort -u":"";
+ system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file";
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getColor{
+
+ my($count,$hcolor,$format)=@_;
+ for my $col ( keys % { $hcolor} ) {
+ return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]);
+ }
+ return "white" if($format eq "circos");
+ return "255,255,255" if($format eq "bed");
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#check if the configuration file is correct
+sub validateconfiguration{
+
+ my %conf=%{$_[0]};
+ my $list_prgs="@ARGV";
+
+ my @circos_params=qw(organism_id colorcode);
+ my @bed_params=qw(colorcode);
+ my @compare_params=qw(list_samples list_read_lengths sample_link_file reference_link_file);
+
+ unless (defined($conf{general}{output_dir})) {
+ $conf{general}{output_dir} = ".";
+ }
+ unless (-d $conf{general}{output_dir}){
+ mkdir $conf{general}{output_dir} or die;
+ }
+ $conf{general}{output_dir}.="/" if($conf{general}{output_dir}!~/\/$/);
+
+
+ if($list_prgs=~/links2compare/){
+ foreach my $p (@compare_params) {
+ die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p});
+ }
+
+ unless (defined($conf{compare}{same_sv_type})) {
+ $conf{compare}{same_sv_type} = 0;
+ }
+
+ unless (defined($conf{compare}{min_overlap})) {
+ $conf{compare}{min_overlap} = 1E-9;
+ }
+
+ if($conf{compare}{circos_output}){
+ foreach my $p (@circos_params) {
+ next if($list_prgs=~/^ratio/ && $p eq "colorcode");
+ die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
+ }
+ }
+ if($conf{compare}{bed_output}){
+ foreach my $p (@bed_params) {
+ die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
+ }
+ die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths});
+
+ my @samples=split(",",$conf{compare}{list_samples});
+ my @read_lengths=split(",",$conf{compare}{list_read_lengths});
+ for my $i (0..$#samples){
+ my @l=split("-",$read_lengths[$i]);
+ $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]};
+ }
+ }
+ }
+
+
+}
+#------------------------------------------------------------------------------#
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/SVDetect_compare.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/SVDetect_compare.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,218 @@
+
+
+structural variants between two samples
+
+SVDetect_compare.pl links2compare -conf '$config_file' -l '$log_file' -N '$sample_name.$reference_name'
+
+#if $links2SV
+-out1 '$common_sv_file'
+-out2 '$sample_sv_file'
+-out3 '$reference_sv_file'
+#end if
+
+#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2circos
+-out4 '$common_circos_file'
+-out5 '$sample_circos_file'
+-out6 '$reference_circos_file'
+#end if
+
+#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2bed
+-out7 '$common_bed_file'
+-out8 '$sample_bed_file'
+-out9 '$reference_bed_file'
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ links2SV is True
+
+
+ links2SV is True
+
+
+ links2SV is True
+
+
+
+ (
+ file_conversion['file_conversion_select']=="convert" and
+ file_conversion['links2circos'] is True
+ )
+
+
+
+ (
+ file_conversion['file_conversion_select']=="convert" and
+ file_conversion['links2circos'] is True
+ )
+
+
+
+ (
+ file_conversion['file_conversion_select']=="convert" and
+ file_conversion['links2circos'] is True
+ )
+
+
+
+
+ (
+ file_conversion['file_conversion_select']=="convert" and
+ file_conversion['links2bed'] is True
+ )
+
+
+
+ (
+ file_conversion['file_conversion_select']=="convert" and
+ file_conversion['links2bed'] is True
+ )
+
+
+
+ (
+ file_conversion['file_conversion_select']=="convert" and
+ file_conversion['links2bed'] is True
+ )
+
+
+
+
+
+
+
+
+
+
+<general>
+output_dir=$__new_file_path__/svdetect
+</general>
+
+#if $file_conversion.file_conversion_select == "convert"
+#if $file_conversion.links2circos
+<circos>
+organism_id=${file_conversion.organism_id}
+<colorcode>
+#for $color_repeat in $file_conversion.color_code
+${color_repeat.color}=${color_repeat.interval}
+#end for
+</colorcode>
+</circos>
+#end if
+#if $file_conversion.links2bed
+<bed>
+<colorcode>
+#for $color_repeat in $file_conversion.color_code
+#if str($color_repeat.color)== "grey"
+190,190,190=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "black"
+0,0,0=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "blue"
+0,0,255=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "green"
+0,255,0=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "purple"
+153,50,205=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "orange"
+255,140,0=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "red"
+255,0,0=${color_repeat.interval}
+#end if
+#end for
+</colorcode>
+</bed>
+#end if
+#end if
+
+<compare>
+list_samples=${sample_name},${reference_name}
+list_read_lengths=${sample_read1_length}-${sample_read2_length},${reference_read1_length}-${reference_read2_length}
+sample_link_file=${sample_mates_file}
+reference_link_file=${reference_mates_file}
+min_overlap=${min_overlap}
+same_sv_type=${same_sv_type}
+sv_output=${links2SV}
+#if $file_conversion.file_conversion_select == "convert"
+circos_output=${$file_conversion.links2circos}
+bed_output=${$file_conversion.links2bed}
+#end if
+</compare>
+
+
+
+
+
+**What it does**
+
+SVDetect - Version : 0.8b
+
+Comparison of clusters between two samples to get common or sample-specific SVs
+
+This program is designed to compare filtered links between two anomalously mapped mate-pair/paired-end datasets
+and to identify common and sample-specific SVs (like the usual sample/reference design).
+Overlaps between coordinates of clusters and types of SVs are used as parameters of comparison.
+
+Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html
+
+-----
+
+.. class:: infomark
+
+Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect.
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/SVDetect_import.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/SVDetect_import.sh Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+
+while getopts "i:o:" optionName; do
+case "$optionName" in
+
+i) INPUT="$OPTARG";;
+o) OUTPUT="$OPTARG";;
+
+esac
+done
+
+rm $OUTPUT
+
+ln -s $INPUT $OUTPUT
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/SVDetect_import.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/SVDetect_import.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,85 @@
+
+ BAM, chromosome info or sv files
+ SVDetect_import.sh -i $file_path
+ #if str($type.file_type)=="bam"
+ -o $outbamfile
+ #elif str($type.file_type)=="len"
+ -o $outlenfile
+ #elif str($type.file_type)=="sv"
+ -o $outsvfile
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ type['file_type']=="bam"
+
+
+ type['file_type']=="len"
+
+
+ type['file_type']=="sv"
+
+
+
+**What it does**
+
+This tool allows you to import quickly a BAM file, a chromosome info file or a SVDetect output file from you computer as inputs for SVDetect.
+
+
+**Example of chromosome file**
+
+Input len file::
+
+ 1 chr1 247249719
+ 2 chr2 242951149
+ 3 chr3 199501827
+ 4 chr4 191273063
+ 5 chr5 180857866
+ 6 chr6 170899992
+ 7 chr7 158821424
+ 8 chr8 146274826
+ 9 chr9 140273252
+ 10 chr10 135374737
+ 11 chr11 134452384
+ 12 chr12 132349534
+ 13 chr13 114142980
+ 14 chr14 106368585
+ 15 chr15 100338915
+ 16 chr16 88827254
+ 17 chr17 78774742
+ 18 chr18 76117153
+ 19 chr19 63811651
+ 20 chr20 62435964
+ 21 chr21 46944323
+ 22 chr22 49691432
+ 23 chrX 154913754
+ 24 chrY 57772954
+
+-----
+
+.. class:: infomark
+
+Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect.
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/SVDetect_run_parallel.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/SVDetect_run_parallel.pl Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,3537 @@
+#!/usr/bin/perl -w
+
+=pod
+
+=head1 NAME
+
+SVDetect - Program designed to the detection of structural variations
+from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads
+
+Version: 0.8b for Galaxy
+
+=head1 SYNOPSIS
+
+SVDetect -conf [-help] [-man]
+
+ Command:
+
+ linking detection and isolation of links
+ filtering filtering of links according different parameters
+ links2circos links conversion to circos format
+ links2bed paired-ends of links converted to bed format (UCSC)
+ links2SV formatted output to show most significant SVs
+ cnv calculate copy-number profiles
+ ratio2circos ratio conversion to circos density format
+ ratio2bedgraph ratio conversion to bedGraph density format (UCSC)
+
+=head1 DESCRIPTION
+
+This is a command-line interface to SVDetect.
+
+
+=head1 AUTHORS
+
+Bruno Zeitouni Ebruno.zeitouni@curie.frE,
+Valentina Boeva Evalentina.boeva@curie.frE
+
+=cut
+
+# -------------------------------------------------------------------
+
+use strict;
+use warnings;
+
+use Pod::Usage;
+use Getopt::Long;
+
+use Config::General;
+use Tie::IxHash;
+use FileHandle;
+use Parallel::ForkManager;
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#PARSE THE COMMAND LINE
+my %OPT;
+GetOptions(\%OPT,
+ 'conf=s',
+ 'out1=s', #GALAXY
+ 'out2=s', #GALAXY
+ 'out3=s', #GALAXY
+ 'out4=s', #GALAXY
+ 'out5=s', #GALAXY
+ 'l=s', #GALAXY
+ 'N=s',#GALAXY
+ 'help',#GALAXY
+ 'man'
+ );
+
+pod2usage() if $OPT{help};
+pod2usage(-verbose=>2) if $OPT{man};
+pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf});
+
+pod2usage() if(@ARGV<1);
+
+tie (my %func, 'Tie::IxHash',linking=>\&createlinks,
+ filtering=>\&filterlinks,
+ links2circos=>\&links2circos,
+ links2bed=>\&links2bed,
+ links2compare=>\&links2compare,
+ links2SV=>\&links2SV,
+ cnv=>\&cnv,
+ ratio2circos=>\&ratio2circos,
+ ratio2bedgraph=>\&ratio2bedgraph);
+
+foreach my $command (@ARGV){
+ pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command}));
+}
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#READ THE CONFIGURATION FILE
+my $conf=Config::General->new( -ConfigFile => $OPT{conf},
+ -Tie => "Tie::IxHash",
+ -AllowMultiOptions => 1,
+ -LowerCaseNames => 1,
+ -AutoTrue => 1);
+my %CONF= $conf->getall;
+validateconfiguration(\%CONF); #validation of the configuration parameters
+
+my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY
+
+my $pt_log_file=$OPT{l}; #GALAXY
+my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY
+my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY
+my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY
+my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY
+my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY
+
+$CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY
+$CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY
+
+my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY
+open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#COMMAND EXECUTION
+foreach my $command (@ARGV){
+ &{$func{$command}}();
+}
+print LOG "-- end\n";#GALAXY
+
+close LOG;#GALAXY
+system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY
+exit(0);
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+
+
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
+#FUNCTIONS
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 1: Detection of links from mate-pairs data
+sub createlinks{
+
+ my %CHR; #main hash table 1: fragments, links
+ my %CHRID;
+ my @MATEFILES;
+
+ my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type};
+ my @path=split(/\//,$output_prefix);
+ $output_prefix=$CONF{general}{output_dir}.$path[$#path];
+ my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path];
+ my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path];
+
+ shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters
+ $CONF{detection}{window_size},
+ $CONF{detection}{step_length},
+ $CONF{general}{cmap_file});
+
+ if($CONF{detection}{split_mate_file}){
+
+ splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix,
+ $CONF{general}{sv_type},
+ $CONF{general}{mates_file},
+ $CONF{general}{input_format},
+ $CONF{general}{read_lengths}
+ );
+ }else{
+
+ @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!";
+ chomp(@MATEFILES);
+ print LOG "# Splitted mate files already created.\n";
+ }
+
+
+ #Parallelization of the linking per chromosome for intra + interchrs
+ my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
+
+ foreach my $matefile (@MATEFILES){
+
+ my $pid = $pm->start and next;
+ getlinks(\%CHR, \%CHRID, $matefile);
+ $pm->finish;
+
+ }
+ $pm->wait_all_children;
+
+ #Merge the chromosome links file into only one
+ my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!";
+ chomp(@LINKFILES);
+ catFiles( \@LINKFILES => "$output_prefix.links" );
+
+ system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY
+ print LOG "# Linking end procedure : output created: $output_prefix.links\n";
+ #unlink(@LINKFILES);
+ #unlink(@MATEFILES);
+
+ undef %CHR;
+ undef %CHRID;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getlinks {
+
+ my ($chr,$chrID,$tmp_mates_prefix)=@_;
+
+ my $tmp_links_prefix=$tmp_mates_prefix;
+ $tmp_links_prefix=~s/\/mates\//\/links\//;
+
+ my %PAIR; #main hash table 2: pairs
+
+ linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs
+ $CONF{general}{read_lengths},
+ $CONF{detection}{window_size},
+ $CONF{detection}{step_length},
+ $tmp_mates_prefix,
+ $CONF{general}{input_format},
+ $CONF{general}{sv_type},
+ "$tmp_links_prefix.links.mapped"
+ );
+
+ getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons
+ "$tmp_links_prefix.links.unique");
+
+ defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links
+ $CONF{general}{input_format},
+ $CONF{general}{sv_type},
+ $CONF{general}{read_lengths},
+ "$tmp_links_prefix.links.unique",
+ "$tmp_links_prefix.links.unique_defined");
+
+ sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates
+ "$tmp_links_prefix.links.sorted");
+
+ removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links
+ "$tmp_links_prefix.links",1); #file output
+
+
+ undef %PAIR;
+
+ unlink("$tmp_links_prefix.links.mapped",
+ "$tmp_links_prefix.links.unique",
+ "$tmp_links_prefix.links.unique_defined",
+ "$tmp_links_prefix.links.sorted");
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub splitMateFile{
+
+ my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_;
+
+ print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n";
+
+ my %filesHandle;
+
+ #fichier matefile inter
+ if($sv_type=~/^(all|inter)$/){
+ my $newFileName="$output_prefix.interchrs";
+ push(@{$files_list},$newFileName);
+ my $fh = new FileHandle;
+ $fh->open(">$newFileName");
+ $filesHandle{inter}=$fh;
+ }
+
+ #fichiers matefiles intra
+ if($sv_type=~/^(all|intra)$/){
+ foreach my $k (1..$chr->{nb_chrs}){
+ my $newFileName=$output_prefix.".".$chr->{$k}->{name};
+ push(@{$files_list},$newFileName);
+ my $fh = new FileHandle;
+ $fh->open(">$newFileName");
+ $filesHandle{$k}=$fh;
+ }
+ }
+
+ if ($mates_file =~ /.gz$/) {
+ open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat
+ }elsif($mates_file =~ /.bam$/){
+ open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
+ }else{
+ open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n";
+ }
+
+
+ while(){
+
+ my @t=split;
+ my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2);
+
+ next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
+
+ next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
+
+ ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
+
+ if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){
+ my $fh2print=$filesHandle{inter};
+ print $fh2print join("\t",@t)."\n";
+ }
+
+ if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){
+ my $fh2print=$filesHandle{$chr_read1};
+ print $fh2print join("\t",@t)."\n";
+
+ }
+ }
+
+ foreach my $name (keys %filesHandle){
+ my $fh=$filesHandle{$name};
+ $fh->close;
+ }
+
+ print LOG "# Splitted mate files of \"$mates_file\" created.\n";
+}
+
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub filterlinks{
+
+ my %CHR;
+ my %CHRID;
+ my @LINKFILES;
+ my @FLINKFILES;
+
+ my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type};
+ my @path=split(/\//,$output_prefix);
+ $output_prefix=$CONF{general}{output_dir}.$path[$#path];
+ my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path];
+
+ createChrHashTables(\%CHR,\%CHRID,
+ $CONF{general}{cmap_file});
+
+ if($CONF{filtering}{split_link_file}){
+
+ splitLinkFile(\%CHR, \%CHRID, \@LINKFILES,
+ $tmp_links_prefix,
+ $CONF{general}{sv_type},
+ "$output_prefix.links",
+ );
+ }else{
+
+ @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n";
+ chomp(@LINKFILES);
+ print LOG "# Splitted link files already created.\n";
+ }
+
+ #Parallelization of the filtering per chromosome for intra + interchrs
+ my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
+
+ foreach my $linkfile (@LINKFILES){
+
+ my $pid = $pm->start and next;
+ getFilteredlinks(\%CHR, \%CHRID, $linkfile);
+ $pm->finish;
+
+ }
+ $pm->wait_all_children;
+
+ #Merge the chromosome links file into only one
+ @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n";
+ chomp(@FLINKFILES);
+ catFiles( \@FLINKFILES => "$output_prefix.links.filtered" );
+
+ system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY
+ print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n";
+
+ undef %CHR;
+ undef %CHRID;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub splitLinkFile{
+
+ my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_;
+
+ print LOG "# Splitting the link file for parallel processing...\n";
+
+ my %filesHandle;
+
+ #fichier matefile inter
+ if($sv_type=~/^(all|inter)$/){
+ my $newFileName="$input_prefix.interchrs.links";
+ push(@{$files_list},$newFileName);
+ my $fh = new FileHandle;
+ $fh->open(">$newFileName");
+ $filesHandle{inter}=$fh;
+ }
+
+ #fichiers matefiles intra
+ if($sv_type=~/^(all|intra)$/){
+ foreach my $k (1..$chr->{nb_chrs}){
+ my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links";
+ push(@{$files_list},$newFileName);
+ my $fh = new FileHandle;
+ $fh->open(">$newFileName");
+ $filesHandle{$k}=$fh;
+ }
+ }
+
+ open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n";
+ while(){
+
+ my @t=split;
+ my ($chr_read1,$chr_read2)=($t[0],$t[3]);
+
+ next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
+
+ ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
+
+ if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){
+ my $fh2print=$filesHandle{inter};
+ print $fh2print join("\t",@t)."\n";
+ }
+
+ if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){
+ my $fh2print=$filesHandle{$chr_read1};
+ print $fh2print join("\t",@t)."\n";
+
+ }
+ }
+
+ foreach my $name (keys %filesHandle){
+ my $fh=$filesHandle{$name};
+ $fh->close;
+ }
+
+ print LOG "# Splitted link files created.\n";
+}
+
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 2: Filtering processing
+sub getFilteredlinks {
+
+ my ($chr,$chrID,$tmp_links_prefix)=@_;
+ my %PAIR;
+
+ strandFiltering($chr,$chrID,
+ $CONF{filtering}{nb_pairs_threshold}, #filtering of links
+ $CONF{filtering}{strand_filtering},
+ $CONF{filtering}{chromosomes},
+ $CONF{general}{input_format},
+ $CONF{general}{cmap_file},
+ $CONF{general}{mates_orientation},
+ $CONF{general}{read_lengths},
+ $tmp_links_prefix,
+ "$tmp_links_prefix.filtered",
+ );
+
+ if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering
+
+ my @tmpfiles;
+
+ rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique");
+
+ getUniqueLinks("$tmp_links_prefix.filtered_unique",
+ "$tmp_links_prefix.filtered");
+
+ push(@tmpfiles,"$tmp_links_prefix.filtered_unique");
+
+ if($CONF{filtering}{order_filtering}){ #filtering using the order
+
+ rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered");
+
+ orderFiltering($chr,$chrID,
+ $CONF{filtering}{nb_pairs_threshold},
+ $CONF{filtering}{nb_pairs_order_threshold},
+ $CONF{filtering}{mu_length},
+ $CONF{filtering}{sigma_length},
+ $CONF{general}{mates_orientation},
+ $CONF{general}{read_lengths},
+ "$tmp_links_prefix.filtered_ordered",
+ "$tmp_links_prefix.filtered",
+ );
+
+ push(@tmpfiles,"$tmp_links_prefix.filtered_ordered");
+ }
+
+ if (($CONF{filtering}{insert_size_filtering})&&
+ ($CONF{general}{sv_type} ne 'inter')){
+
+ rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize");
+
+ addInsertionInfo($chr,$chrID,
+ $CONF{filtering}{nb_pairs_threshold},
+ $CONF{filtering}{order_filtering},
+ $CONF{filtering}{indel_sigma_threshold},
+ $CONF{filtering}{dup_sigma_threshold},
+ $CONF{filtering}{singleton_sigma_threshold},
+ $CONF{filtering}{mu_length},
+ $CONF{filtering}{sigma_length},
+ $CONF{general}{mates_orientation},
+ $CONF{general}{read_lengths},
+ "$tmp_links_prefix.filtered_withoutIndelSize",
+ "$tmp_links_prefix.filtered"
+ );
+
+ push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize");
+ }
+
+ sortLinks("$tmp_links_prefix.filtered",
+ "$tmp_links_prefix.filtered_sorted");
+
+ removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted",
+ "$tmp_links_prefix.filtered_nodup",
+ );
+
+ postFiltering("$tmp_links_prefix.filtered_nodup",
+ "$tmp_links_prefix.filtered",
+ $CONF{filtering}{final_score_threshold});
+
+ push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup");
+
+ unlink(@tmpfiles);
+
+
+ }
+ undef %PAIR;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 3: Circos format conversion for links
+sub links2circos{
+
+ my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
+ my @path=split(/\//,$input_file);
+ $input_file=$CONF{general}{output_dir}.$path[$#path];
+
+ my $output_file.=$input_file.".segdup.txt";
+
+ links2segdup($CONF{circos}{organism_id},
+ $CONF{circos}{colorcode},
+ $input_file,
+ $output_file); #circos file output
+
+ system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 4: Bed format conversion for links
+sub links2bed{
+
+ my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
+ my @path=split(/\//,$input_file);
+ $input_file=$CONF{general}{output_dir}.$path[$#path];
+
+ my $output_file.=$input_file.".bed.txt";
+
+ links2bedfile($CONF{general}{read_lengths},
+ $CONF{bed}{colorcode},
+ $input_file,
+ $output_file); #bed file output
+
+ system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 6: Bed format conversion for links
+sub links2SV{
+
+ my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered";
+
+ my @path=split(/\//,$input_file);
+ $input_file=$CONF{general}{output_dir}.$path[$#path];
+
+ my $output_file.=$input_file.".sv.txt";
+
+
+ links2SVfile( $input_file,
+ $output_file);
+
+ system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 7: copy number variations, coverage ratio calculation
+sub cnv{
+
+ my %CHR;
+ my %CHRID;
+ my @MATEFILES;
+ my @MATEFILES_REF;
+
+ my $output_prefix=$CONF{general}{mates_file};
+ my $output_prefix_ref=$CONF{detection}{mates_file_ref};
+ my @path=split(/\//,$output_prefix);
+ my @path_ref=split(/\//,$output_prefix_ref);
+ $output_prefix=$CONF{general}{output_dir}.$path[$#path];
+ $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref];
+ my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path];
+ my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref];
+ my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path];
+
+ shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters
+ $CONF{detection}{window_size},
+ $CONF{detection}{step_length},
+ $CONF{general}{cmap_file});
+
+ if($CONF{detection}{split_mate_file}){
+
+ splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix,
+ "intra",
+ $CONF{general}{mates_file},
+ $CONF{general}{input_format},
+ $CONF{general}{read_lengths}
+ );
+
+ splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref,
+ "intra",
+ $CONF{detection}{mates_file_ref},
+ $CONF{general}{input_format},
+ $CONF{general}{read_lengths}
+ );
+
+
+ }else{
+
+ @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!";
+ chomp(@MATEFILES);
+ @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!";
+ chomp(@MATEFILES_REF);
+ print LOG "# Splitted sample and reference mate files already created.\n";
+ }
+
+ #Parallelization of the cnv per chromosome
+ my $pm = new Parallel::ForkManager($CONF{general}{num_threads});
+
+ foreach my $file (0..$#MATEFILES){
+
+ my $pid = $pm->start and next;
+
+ densityCalculation(\%CHR, \%CHRID, $file,
+ $CONF{general}{read_lengths},
+ $CONF{detection}{window_size},
+ $CONF{detection}{step_length},
+ \@MATEFILES,
+ \@MATEFILES_REF,
+ $MATEFILES[$file].".density",
+ $CONF{general}{input_format});
+
+ $pm->finish;
+
+ }
+ $pm->wait_all_children;
+
+ #Merge the chromosome links file into only one
+ my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!";
+ chomp(@DENSITYFILES);
+ catFiles( \@DENSITYFILES => "$output_prefix.density" );
+
+ print LOG "# cnv end procedure : output created: $output_prefix.density\n";
+
+
+ undef %CHR;
+ undef %CHRID;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 8: Circos format conversion for cnv ratios
+sub ratio2circos{
+
+ my $input_file=$CONF{general}{mates_file}.".density";
+
+ my @path=split(/\//,$input_file);
+ $input_file=$CONF{general}{output_dir}.$path[$#path];
+
+ my $output_file.=$input_file.".segdup.txt";
+
+ ratio2segdup($CONF{circos}{organism_id},
+ $input_file,
+ $output_file);
+}
+#------------------------------------------------------------------------------#
+#MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios
+sub ratio2bedgraph{
+
+ my $input_file=$CONF{general}{mates_file}.".density";
+
+ my @path=split(/\//,$input_file);
+ $input_file=$CONF{general}{output_dir}.$path[$#path];
+
+ my $output_file.=$input_file.".bedgraph.txt";
+
+ ratio2bedfile($input_file,
+ $output_file); #bed file output
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Creation of the fragment library
+sub shearingChromosome{
+
+ print LOG "# Making the fragments library...\n";
+
+ my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters
+
+ createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name
+
+ foreach my $k (1..$chr->{nb_chrs}){
+
+ print LOG"-- $chr->{$k}->{name}\n";
+
+ my $frag=1;
+ for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){
+
+ my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1;
+ $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage
+
+ if($end==($chr->{$k}->{length})-1){
+ $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome
+ last;
+ }
+ $frag++;
+ }
+ }
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Creation of chromosome hash tables from the cmap file
+sub createChrHashTables{
+
+ my ($chr,$chrID,$cmap_file)=@_;
+ $chr->{nb_chrs}=0;
+
+ open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n";
+ while(){
+
+ if(/^\s+$/){ next;}
+ my ($k,$name,$length) = split;
+ $chr->{$k}->{name}=$name;
+ $chr->{$k}->{length}=$length;
+ $chrID->{$name}=$k;
+ $chr->{nb_chrs}++;
+
+ }
+ close CMAP;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Read the mate file according the input format file (solid, eland or sam)
+sub readMateFile{
+
+ my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_;
+ my ($strand1,$strand2);
+
+ if($file_type eq "solid"){
+
+ ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based
+
+ }else{
+ my ($tag_length1,$tag_length2);
+ ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based
+ if($file_type eq "eland");
+
+ if($file_type eq "sam"){
+
+ return 0 if ($$t[0]=~/^@/); #header sam filtered out
+
+ ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]);
+
+ return 0 if (($$t[1]&0x0004) || ($$t[1]&0x0008));
+
+ $$chr2=$$chr1 if($$chr2 eq "=");
+
+ $strand1 = (($$t[1]&0x0010))? 'R':'F';
+ $strand2 = (($$t[1]&0x0020))? 'R':'F';
+
+ $$order1= (($$t[1]&0x0040))? '1':'2';
+ $$order2= (($$t[1]&0x0080))? '1':'2';
+ $tag_length1 = $tag_length->{$$order1};
+ $tag_length2 = $tag_length->{$$order2};
+ }
+
+ $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts
+ $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R");
+ }
+ return 1;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Parsing of the mates files and creation of links between 2 chromosomal fragments
+sub linking{
+
+ my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_;
+ my %link;
+
+ my $record=0;
+ my $nb_links=0;
+ my $warn=10000;
+
+ my @sfile=split(/\./,$mates_file);
+ my $fchr=$sfile[$#sfile];
+
+ my $fh = new FileHandle;
+
+ print LOG "# $fchr : Linking procedure...\n";
+ print LOG "-- file=$mates_file\n".
+ "-- chromosome=$fchr\n".
+ "-- input format=$input_format\n".
+ "-- type=$sv_type\n".
+ "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n".
+ "-- window size=$window_dist, step length=$step\n";
+
+ if ($mates_file =~ /.gz$/) {
+ $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat
+ }elsif($mates_file =~ /.bam$/){
+ $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
+ }else{
+ $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n";
+ }
+
+
+ while(<$fh>){
+
+ my @t=split; #for each mate-pair
+ my $mate=$t[0];
+ my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2);
+
+ next if(exists $$pair{$mate});
+
+ next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
+
+ next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2});
+
+ ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
+
+ if($sv_type ne "all"){
+ if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) ||
+ ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){
+ }else{
+ next;
+ }
+ }
+
+ $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function)
+
+ $record++;
+
+ my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read
+
+ recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format);
+ recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format);
+
+ for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation
+
+ if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){
+
+ if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){
+
+ for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){
+
+ if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) {
+
+ if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){
+
+ makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link
+ }
+
+ }else{
+
+ $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step);
+ }
+ }
+ }
+
+ }else{
+
+ $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step);
+ }
+ }
+
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n";
+ $warn+=10000;
+ }
+ }
+ $fh->close;
+
+ if(!$nb_links){
+ print LOG "-- $fchr : No mate-pairs !\n".
+ "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n";
+ }
+
+ print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n";
+
+ print LOG "-- $fchr : writing...\n";
+
+ $fh = new FileHandle;
+
+ $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n";
+
+ foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output
+
+ foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){
+
+ foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){
+
+ foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){
+
+ my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2});
+ print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t".
+ "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t".
+ scalar @count."\t". #nb of read
+ $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list
+ }
+ }
+ }
+ }
+
+ $fh->close;
+
+ undef %link;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#remove exact links doublons according to the mate list
+sub getUniqueLinks{
+
+ my ($links_file,$nrlinks_file)=@_;
+ my %links;
+ my %pt;
+ my $nb_links;
+ my $n=1;
+
+ my $record=0;
+ my $warn=300000;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+ my $fh = new FileHandle;
+
+ print LOG "# $fchr : Getting unique links...\n";
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+
+ while(<$fh>){
+
+ my @t=split;
+ my $mates=$t[7];
+ $record++;
+
+ if(!exists $links{$mates}){ #Unique links selection
+
+ $links{$mates}=[@t];
+ $pt{$n}=$links{$mates};
+ $n++;
+
+
+ }else{ #get the link coordinates from the mate-pairs list
+
+ for my $i (1,2,4,5){ #get the shortest regions
+
+ $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start
+ if($i==1 || $i==4);
+ $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end
+ if($i==2 || $i==5);
+ }
+ }
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n";
+ $warn+=300000;
+ }
+ }
+ $fh->close;
+
+ $nb_links=$n-1;
+ print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n";
+
+ $fh = new FileHandle;
+ $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n";
+ print LOG "-- $fchr : writing...\n";
+ for my $i (1..$nb_links){
+
+ print $fh join("\t",@{$pt{$i}})."\n"; #all links output
+ }
+
+ $fh->close;
+
+ undef %links;
+ undef %pt;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#get the new coordinates of each link from the mate list
+sub defineCoordsLinks{
+
+ my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+ my $fh = new FileHandle;
+ my $fh2 = new FileHandle;
+
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+ $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n";
+
+ print LOG "# $fchr : Defining precise link coordinates...\n";
+
+ my $record=0;
+ my $warn=100000;
+
+ my %coords;
+ my %strands;
+ my %order;
+ my %ends_order;
+
+ while(<$fh>){
+
+
+ my ($col1,$col2)=(1,2); #for an intrachromosomal link
+ my $diffchr=0; #difference between chr1 and chr2
+ my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8];
+ ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2});
+ if ($chr1 != $chr2){ #for an interchromosomal link
+ $col1=$col2=0; #no distinction
+ $diffchr=1;
+ }
+
+ my @pairs=split(",",$mates_list);
+
+ $coords{$col1}{$chr1}->{start}=undef;
+ $coords{$col1}{$chr1}->{end}=undef;
+ $coords{$col2}{$chr2}->{start}=undef;
+ $coords{$col2}{$chr2}->{end}=undef;
+ $strands{$col1}{$chr1}=undef;
+ $strands{$col2}{$chr2}=undef;
+ $ends_order{$col1}{$chr1}=undef;
+ $ends_order{$col2}{$chr2}=undef;
+
+
+ $order{$col1}{$chr1}->{index}->{1}=undef;
+ $order{$col1}{$chr1}->{index}->{2}=undef;
+ $order{$col2}{$chr2}->{index}->{1}=undef;
+ $order{$col2}{$chr2}->{index}->{2}=undef;
+ $order{$col1}{$chr1}->{order}=undef;
+ $order{$col2}{$chr2}->{order}=undef;
+
+ $record++;
+
+ for my $p (0..$#pairs){ #for each pair
+
+ my ($coord_start_read1,$coord_end_read1);
+ my ($coord_start_read2,$coord_end_read2);
+ my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format);
+ my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format);
+
+ if(!$diffchr){ #for a intrachromosomal link
+ if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column
+ ($col1,$col2)=(2,1);
+ }else{
+ ($col1,$col2)=(1,2);
+ }
+ }
+
+ push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads
+ push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1);
+ push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2);
+ push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2);
+ push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1);
+ push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2);
+ push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]);
+ push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]);
+ }
+
+ ($col1,$col2)=(1,2) if(!$diffchr);
+
+ my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region
+ my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}}));
+ my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}}));
+ my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}}));
+
+ @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}};
+ @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}};
+
+ foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting)
+ foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){
+
+ if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){
+ ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i;
+ ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j;
+ last;
+ }
+ }
+ }
+
+ foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID
+ foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){
+
+ if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){
+ ${$order{$col1}{$chr1}->{order}}[$i]=$i+1;
+ ${$order{$col2}{$chr2}->{order}}[$i]=$j+1;
+ last;
+ }
+ }
+ }
+
+ @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads
+ @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1});
+ @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2});
+ @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1});
+ @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2});
+ @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start});
+ @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start});
+
+
+ my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output
+ $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2,
+ scalar @pairs,
+ join(",",@pairs),
+ join(",",@{$strands{$col1}{$chr1}}),
+ join(",",@{$strands{$col2}{$chr2}}),
+ join(",",@{$ends_order{$col1}{$chr1}}),
+ join(",",@{$ends_order{$col2}{$chr2}}),
+ join(",",@{$order{$col1}{$chr1}->{order}}),
+ join(",",@{$order{$col2}{$chr2}->{order}}),
+ join(",",@{$coords{$col1}{$chr1}->{start}}),
+ join(",",@{$coords{$col2}{$chr2}->{start}}));
+
+ print $fh2 join("\t",@link)."\n";
+
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn links processed\n";
+ $warn+=100000;
+ }
+ }
+ $fh->close;
+ $fh2->close;
+
+ print LOG "-- $fchr : Total : $record links processed\n";
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Sort links according the concerned chromosomes and their coordinates
+sub sortLinks{
+
+ my ($links_file,$sortedlinks_file,$unique)=@_;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+
+ print LOG "# $fchr : Sorting links...\n";
+
+ my $pipe=($unique)? "| sort -u":"";
+ system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file";
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#removal of fully overlapped links
+sub removeFullyOverlappedLinks{
+
+ my ($links_file,$nrlinks_file,$warn_out)=@_;
+
+ my %pt;
+ my $n=1;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+ my $fh = new FileHandle;
+
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+ while(<$fh>){
+
+ my @t=split("\t",$_);
+ $pt{$n}=[@t];
+ $n++;
+ }
+ $fh->close;
+
+ my $nb_links=$n-1;
+ my $nb=$nb_links;
+
+ my %pt2;
+ my $nb2=1;
+ my $record=0;
+ my $warn=10000;
+
+ print LOG "# $fchr : Removing fully overlapped links...\n";
+
+ LINK:
+
+ for my $i (1..$nb){
+
+ my @link=();
+ my @next_link=();
+ my $ind1=$i;
+
+ $record++;
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n";
+ $warn+=10000;
+ }
+
+ if(exists $pt{$ind1}){
+ @link=@{$pt{$ind1}}; #link1
+ }else{
+ next LINK;
+ }
+
+ my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1
+ my @mates=deleteBadOrderSensePairs(split(",",$link[7]));
+
+ my $ind2=$ind1+1;
+ $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link
+
+ if($ind2<=$nb){
+
+ @next_link=@{$pt{$ind2}}; #link2
+ my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2
+ my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
+
+ while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter
+
+ if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2
+
+ $ind2++;
+ $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb);
+
+ if($ind2>$nb){ #if no more link in the file -> save link1
+
+ $pt2{$nb2}=\@link;
+ $nb2++;
+ next LINK;
+ }
+
+ @next_link=@{$pt{$ind2}};
+ ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]);
+ @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
+ next;
+ }
+
+ my %mates=map{$_ =>1} @mates; #get the equal number of mates
+ my @same_mates = grep( $mates{$_}, @next_mates );
+ my $nb_mates= scalar @same_mates;
+
+ if($nb_mates == scalar @mates){
+
+ delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1
+ next LINK; #go to link2, link2 becomes link1
+
+ }else{
+ delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2
+ $ind2++; #we continue by checking the next link2
+ $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb);
+
+ if($ind2>$nb){ #if no more link in the file -> save link1
+
+ $pt2{$nb2}=\@link;
+ $nb2++;
+ next LINK;
+ }
+
+ @next_link=@{$pt{$ind2}}; #get info of link2
+ ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]);
+ @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7]));
+
+ }
+ }
+ }
+ $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1
+ $nb2++;
+ }
+
+ print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n";
+
+ #OUTPUT
+
+ $fh = new FileHandle;
+ $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n";
+ print LOG "-- $fchr : writing...\n";
+ for my $i (1..$nb2-1){
+
+ print $fh join("\t",@{$pt2{$i}}); #all links output
+ }
+
+ close $fh;
+
+ print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out);
+
+ undef %pt;
+ undef %pt2;
+}
+#------------------------------------------------------------------------------#
+sub postFiltering {
+
+ my ($links_file,$pflinks_file, $finalScore_thres)=@_;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+
+ my ($nb,$nb2)=(0,0);
+
+ print LOG "# $fchr : Post-filtering links...\n";
+ print LOG "-- $fchr : final score threshold = $finalScore_thres\n";
+
+ my $fh = new FileHandle;
+ my $fh2 = new FileHandle;
+
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+ $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n";
+
+
+ while(<$fh>){
+
+ my @t=split("\t",$_);
+ my $score=$t[$#t-1];
+
+ if($score >= $finalScore_thres){
+ print $fh2 join("\t", @t);
+ $nb2++;
+ }
+ $nb++;
+ }
+ $fh->close;
+ $fh2->close;
+
+ print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n";
+ print LOG "-- $fchr : output created: $pflinks_file\n";
+}
+
+
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Filtering of the links
+sub strandFiltering{
+
+ my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes,
+ $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-1];
+
+
+ my %chrs;
+ my %chrs1;
+ my %chrs2;
+ my $nb_chrs;
+ my $exclude;
+
+ if($chromosomes){
+ my @chrs=split(",",$chromosomes);
+ $nb_chrs=scalar @chrs;
+ $exclude=($chrs[0]=~/^\-/)? 1:0;
+ for my $chrName (@chrs){
+ $chrName=~s/^(\-)//;
+ my $col=($chrName=~s/_(1|2)$//);
+
+ if(!$col){
+ $chrs{$chrID->{$chrName}}=undef
+ }else{
+ $chrs1{$chrID->{$chrName}}=undef if($1==1);
+ $chrs2{$chrID->{$chrName}}=undef if($1==2);
+ }
+ }
+ }
+
+ my $record=0;
+ my $nb_links=0;
+ my $warn=10000;
+
+ my $sens_ratio_threshold=0.6;
+
+ print LOG "\# Filtering procedure...\n";
+ print LOG "\# Number of pairs and strand filtering...\n";
+ print LOG "-- file=$links_file\n";
+ print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no").
+ ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n";
+
+
+
+ my $fh = new FileHandle;
+ my $fh2 = new FileHandle;
+
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+ $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
+
+ while(<$fh>){
+
+ my @t=split; #for each link
+ my $is_good=1;
+ $record++;
+
+
+ if($chromosomes){
+
+ my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]});
+
+ if(!$exclude){
+ $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0;
+ $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good);
+ $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good);
+ }else{
+ $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1;
+ $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good);
+ }
+ }
+
+ $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs
+ if($is_good && $strand_filtering){ #if filtering according the strand sense
+
+ my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense
+ my @strands1=split(/,/,$t[8]);
+ my @strands2=split(/,/,$t[9]);
+
+ my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0);
+
+ my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR
+ 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF
+
+ my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE",
+ substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE",
+ inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE");
+
+ if($t[6] == 1){
+
+ push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1);
+
+ }else{
+
+ tie (my %class,'Tie::IxHash');
+ my $split;
+
+ foreach my $i (0..$#mates){
+ $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group
+ }
+
+ my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR};
+ my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF};
+ my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class);
+
+ if($sens_ratio < $sens_ratio_threshold){
+ %class=(1=>'FF', 2=>'FR');
+ $split=1;
+ }else{
+ $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class
+ $split=0;
+ }
+
+ $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold);
+ }
+ }
+
+ if($is_good){ #PRINT
+
+ my $nb=scalar @t;
+ if($nb > 20){
+ my @t2=splice(@t,0,20);
+ print $fh2 join("\t",@t2)."\n";
+ $nb_links++;
+ }
+ $nb_links++;
+ print $fh2 join("\t",@t)."\n";
+ }
+
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n";
+ $warn+=10000;
+ }
+ }
+ $fh->close;
+ $fh2->close;
+
+ print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links);
+
+ print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
+
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getConsistentSenseLinks{
+
+ my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_;
+
+ my $npairs=scalar @$mates;
+
+ my @ends_order1 = split (/,/,$$t[10]);
+ my @ends_order2 = split (/,/,$$t[11]);
+ my @order1 = split (/,/,$$t[12]);
+ my @order2 = split (/,/,$$t[13]);
+ my @positions1 = split (/,/,$$t[14]);
+ my @positions2 = split (/,/,$$t[15]);
+
+ my @newlink;
+
+ foreach my $ind (keys %{$class} ){
+
+ tie (my %flink,'Tie::IxHash');
+ my @orders2remove=();
+
+ foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group
+
+ if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){
+ push(@{$flink{mates}},$$mates[$i]);
+ push(@{$flink{strands1}},$$strands1[$i]);
+ push(@{$flink{strands2}},$$strands2[$i]);
+ push(@{$flink{ends_order1}},$ends_order1[$i]);
+ push(@{$flink{ends_order2}},$ends_order2[$i]);
+ push(@{$flink{positions1}},$positions1[$i]);
+ push(@{$flink{positions2}},$positions2[$i]);
+
+ }else{
+ push(@orders2remove,$order1[$i]);
+ }
+ }
+
+ @{$flink{order1}}=();
+ @{$flink{order2}}=();
+ if(scalar @orders2remove > 0){
+ getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2})
+ }else{
+ @{$flink{order1}}=@order1;
+ @{$flink{order2}}=@order2;
+ }
+
+ my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length);
+ my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length);
+
+ my $fnpairs=scalar @{$flink{mates}};
+ my $strand_filtering_ratio=$fnpairs."/".$npairs;
+ my $real_ratio=$fnpairs/$npairs;
+
+ if($fnpairs>=$thres){ #filtering according the number of pairs
+
+ push(@newlink,
+ $$t[0],
+ min(min(@{$flink{positions1}}),min(@ends1)),
+ max(max(@{$flink{positions1}}),max(@ends1)),
+ $$t[3],
+ min(min(@{$flink{positions2}}),min(@ends2)),
+ max(max(@{$flink{positions2}}),max(@ends2)),
+ $fnpairs,
+ join(",",@{$flink{mates}}),
+ join(",",@{$flink{strands1}}),
+ join(",",@{$flink{strands2}}),
+ join(",",@{$flink{ends_order1}}),
+ join(",",@{$flink{ends_order2}}),
+ join(",",@{$flink{order1}}),
+ join(",",@{$flink{order2}}),
+ join(",",@{$flink{positions1}}),
+ join(",",@{$flink{positions2}}),
+ $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]},
+ $strand_filtering_ratio,
+ $real_ratio,
+ $npairs
+ );
+ }
+ }
+
+ if (grep {defined($_)} @newlink) {
+ @$t=@newlink;
+ return 1
+ }
+ return 0;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getNewOrders{
+
+ my($tab1,$tab2,$list,$newtab1,$newtab2)=@_;
+ my $j=1;
+ my $k=1;
+ for my $i (0..$#{$tab2}){
+ my $c=0;
+ for my $j (0..$#{$list}){
+ $c++ if(${$list}[$j] < ${$tab2}[$i]);
+ if(${$list}[$j] == ${$tab2}[$i]){
+ $c=-1; last;
+ }
+ }
+ if($c!=-1){
+ push(@{$newtab2}, ${$tab2}[$i]-$c);
+ push(@{$newtab1}, $k);
+ $k++;
+ }
+ }
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#Filtering of the links using their order
+sub orderFiltering {
+
+ my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+
+ my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0;
+
+ my $record=0;
+ my $warn=10000;
+ my $nb_links=0;
+
+ my $quant05 = 1.644854;
+ my $quant001 = 3.090232;
+ my $alphaDist = $quant05 * 2 * $sigma;
+ my $maxFragmentLength = &floor($quant001 * $sigma + $mu);
+
+ print LOG "\# Filtering by order...\n";
+ print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n";
+ print LOG "-- distance between comparable pairs was set to $alphaDist\n";
+ print LOG "-- maximal fragment length was set to $maxFragmentLength\n";
+
+
+ my $fh = new FileHandle;
+ my $fh2 = new FileHandle;
+
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+ $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
+
+ while(<$fh>){
+
+ $record++;
+ my @t = split;
+ my ($chr1,$chr2,$mates_list)=@t[0,3,7];
+ my @pairs=split(",",$mates_list);
+ ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2});
+ my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5];
+ my $numberOfPairs = $t[6];
+ my @strand1 = split (/,/,$t[8]);
+ my @strand2 = split (/,/,$t[9]);
+ my @ends_order1 = split (/,/,$t[10]);
+ my @ends_order2 = split (/,/,$t[11]);
+ my @order1 = split (/,/,$t[12]);
+ my @order2 = split (/,/,$t[13]);
+ my @positions1 = split (/,/,$t[14]);
+ my @positions2 = split (/,/,$t[15]);
+ my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length);
+ my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length);
+ my $clusterCoordinates_chr1;
+ my $clusterCoordinates_chr2;
+ my $reads_left = 0;
+
+ my $ifRenv = $t[16];
+ my $strand_ratio_filtering=$t[17];
+
+ #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if orientation is correct
+ my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense);
+ #find pairs type F-RRRR or FFFF-R in the case if orientation is correct
+ #These pairs are annotated as BED pairs forever! They won't be recycled!
+ my $table;
+ for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs
+ my $nonAdeq = 0;
+ for my $j (0..$i-1) {
+ if (exists($table->{$j}->{$i})) {
+ $nonAdeq++;
+ $table->{$i}->{$j} = 1;
+ }
+ }
+ for my $j ($i+1..$numberOfPairs-1) {
+ if ($positions1[$j]-$positions1[$i]>$alphaDist) {
+ if (&reversed ($i,$j,$ifRenv,\@positions2)) {
+ $nonAdeq++;
+ $table->{$i}->{$j} = 1;
+ }
+ }
+ }
+ $table->{$i}->{nonAdeq} = $nonAdeq;
+ }
+
+ for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of orientation
+ &remove($bad,$table);
+ }
+
+ my @falseReads;
+ #RRRR-F -> RRRR or R-FFFF -> FFFF
+ @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table});
+ #these pairs will be recycled later as $secondTable
+ for my $bad (@falseReads) {
+ &remove($bad,$table);
+ }
+
+ my $bad = &check($table);
+ while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER
+ # push (@falseReads, $bad); remove completely!!!
+ &remove($bad,$table);
+ $bad = &check($table);
+ }
+
+ $reads_left = scalar keys %{$table};
+ my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}]));
+ my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}]));
+ my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}]));
+ my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}]));
+
+ $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
+ $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
+
+ my $ifBalanced = 'UNBAL';
+ my $secondTable;
+ my $clusterCoordinates;
+ my ($break_pont_chr1,$break_pont_chr2);
+
+ my $signatureType="";
+
+ my $maxCoord1 =$chr->{$chr1}->{length};
+ my $maxCoord2 =$chr->{$chr2}->{length};
+
+ if (scalar @falseReads) {
+ @falseReads = sort @falseReads;
+ #now delete FRFR choosing the majority
+ my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF
+ @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled
+ my %hashTmp;
+ for my $count1 (0..scalar(@falseReads)-1) {
+ my $i = $falseReads[$count1];
+ $hashTmp{$i} = 1;
+ for my $bad (@newfalseReads) {
+ if ($bad == $i) {
+ delete $hashTmp{$i};
+ next;
+ }
+ }
+ }
+ @falseReads = sort keys %hashTmp; #what is left
+ for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected
+ my $nonAdeq = 0;
+ my $i = $falseReads[$count1];
+
+ for my $count2 (0..$count1-1) {
+ my $j = $falseReads[$count2];
+ if (exists($secondTable->{$j}->{$i})) {
+ $nonAdeq++;
+ $secondTable->{$i}->{$j} = 1;
+ }
+ }
+ for my $count2 ($count1+1..scalar(@falseReads)-1) {
+ my $j = $falseReads[$count2];
+ if ($positions1[$j]-$positions1[$i]>$alphaDist) {
+ if (&reversed ($i,$j,$ifRenv,\@positions2)) {
+ $nonAdeq++;
+ $secondTable->{$i}->{$j} = 1;
+ }
+ }
+ }
+ $secondTable->{$i}->{nonAdeq} = $nonAdeq;
+ }
+
+ my @falseReads2;
+ my $bad = &check($secondTable);
+ while ($bad ne "OK") { #clear the table to reject non adequate pairs
+ push (@falseReads2, $bad);
+ &remove($bad,$secondTable);
+ $bad = &check($secondTable);
+ }
+ if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) {
+ my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}]));
+ my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}]));
+ my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}]));
+ my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}]));
+
+ $ifBalanced = 'BAL';
+
+ if ($ifBalanced eq 'BAL') {
+
+ if (scalar keys %{$table} < $nb_pairs_order_threshold) {
+ $ifBalanced = 'UNBAL'; #kill cluster 1!
+ ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
+ $reads_left = scalar keys %{$table};
+ $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2;
+ $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2;
+ $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2;
+ $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2;
+ $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
+ $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
+
+ } else {
+
+ $reads_left += scalar keys %{$secondTable};
+ next if ($reads_left < $nb_pairs_threshold);
+
+ if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) {
+ ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
+
+ ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1);
+ ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1);
+ ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1);
+ ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1);
+
+ $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1;
+ $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2;
+ }else {
+ $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')';
+ $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')';
+ }
+ $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2);
+ $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2);
+ $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2);
+ $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2);
+ #to calculate breakpoints one need to take into account read orientation in claster..
+ my $leftLetterOk = substr($mate_sense, 0, 1); #R
+ my $rightLetterOk = substr($mate_sense, 1, 1); #F
+
+
+ my @index1 = keys %{$table};
+ my @index2 = keys %{$secondTable};
+
+ my (@generalStrand1,@generalStrand2) = 0;
+
+ if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
+ $leftLetterOk = 'R';
+ $rightLetterOk = 'F';
+ @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1);
+ @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2);
+ } else {
+ @generalStrand1 = @strand1;
+ @generalStrand2 = @strand2; # TODO check if it is correct
+ }
+ if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F)
+ $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')';
+
+ if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) {
+ if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
+ $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')';
+ $signatureType = "TRANSLOC";
+ } else {
+ $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')';
+ $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')';
+ $signatureType = "INS_FRAGMT";
+ }
+
+ } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) {
+ if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
+ $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')';
+ $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')';
+ $signatureType = "INV_INS_FRAGMT";
+ } else {
+ $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')';
+ $signatureType = "INV_TRANSLOC";
+ }
+ } else {
+ #should not occur
+ print STDERR "\nError in orderFiltering\n\n";
+ }
+ }
+
+ elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R)
+ $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')';
+ $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')';
+ if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) {
+ if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
+ $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')';
+ $signatureType = "INV_INS_FRAGMT";
+ } else {
+ $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')';
+ $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')';
+ $signatureType = "INV_COAMPLICON";
+ }
+
+ } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) {
+ if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) {
+ $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')';
+ $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')';
+ $signatureType = "COAMPLICON";
+ } else {
+ $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')';
+ $signatureType = "INS_FRAGMT";
+ }
+ } else {
+ #should not occur
+ $signatureType = "UNDEFINED";
+ }
+ }
+ else { # (F,F) or (R,R) something strange. We will discard the smallest cluster
+ $ifBalanced = 'UNBAL';
+ if (scalar keys %{$secondTable} > scalar keys %{$table}) {
+ ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2
+
+ $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2;
+ $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2;
+ $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2;
+ $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2;
+ $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')';
+ $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')';
+ }
+ $reads_left = scalar keys %{$table};
+ }
+ if ($ifBalanced eq 'BAL') {
+ $ifRenv = $signatureType;
+ }
+ }
+ }
+ }
+ }
+ if ($ifBalanced ne 'BAL') {
+ #define possible break point
+ $coord_start_chr1 = $coord_start_chr1_cluster1;
+ $coord_end_chr1 = $coord_end_chr1_cluster1;
+ $coord_start_chr2 = $coord_start_chr2_cluster1;
+ $coord_end_chr2 = $coord_end_chr2_cluster1;
+
+ my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1;
+ my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2;
+
+ my $leftLetterOk = substr($mate_sense, 0, 1); #R
+ my $rightLetterOk = substr($mate_sense, 1, 1); #F
+
+ my @index = keys %{$table};
+ unless ($diff_sense_ends) {
+ my $firstEndOrder1 = $ends_order1[$index[0]];
+ my $firstEndOrder2 = $ends_order2[$index[0]];
+ $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')';
+ $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')';
+ } else {
+ $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')';
+ $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')';
+ }
+
+ if ($chr1 ne $chr2){
+ $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE");
+ $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE");
+ }
+ }
+
+ if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) {
+ next; #discard the link
+ }
+ if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) {
+ next; #discard the link
+ }
+ my $ratioTxt = "$reads_left/".(scalar @pairs);
+ my ($n1,$nTot) = split ("/",$strand_ratio_filtering);
+ my $ratioReal = $reads_left/$nTot;
+
+ if ($coord_start_chr1<=0) {
+ $coord_start_chr1=1;
+ }
+ if ($coord_start_chr2<=0) {
+ $coord_start_chr2=1;
+ }
+ #create output
+ my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output
+ $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2,
+ $reads_left,
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs),
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1),
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2),
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1),
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2),
+ &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1),
+ &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2),
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1),
+ &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2),
+ $ifRenv,
+ $strand_ratio_filtering,
+ $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2,
+ $ratioReal, $nTot);
+
+ $nb_links++;
+ print $fh2 join("\t",@link)."\n";
+
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n";
+ $warn+=10000;
+ }
+
+ }
+ $fh->close;
+ $fh2->close;
+
+ print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#gets information about ends positions given start, direction and order
+sub getEnds {
+ my ($ends,$starts,$strand,$end_order,$tag_length) = @_;
+ for my $i (0..scalar(@{$starts})-1) {
+ $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length);
+ }
+}
+sub getEnd {
+ my ($start,$strand, $end_order,$tag_length) = @_;
+ return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#gets starts and ends Coords when start=leftmost given positions, directions and orders
+sub getCoordswithLeftMost {
+
+ my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_;
+
+ for my $i (0..scalar(@{$positions})-1) {
+
+ if($strand->[$i] eq 'F'){
+ $starts->[$i]=$positions->[$i];
+ $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1;
+ }else{
+ $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1;
+ $ends->[$i]=$positions->[$i];
+ }
+ }
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering
+
+ my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_;
+
+ my @sfile=split(/\./,$links_file);
+ my $fchr=$sfile[$#sfile-2];
+
+
+ my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0;
+
+ my $record=0;
+ my $nb_links=0;
+ my $warn=10000;
+
+ print LOG "\# Filtering out normal pairs using insert size...\n";
+ print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n";
+ print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-".
+ ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n";
+ print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-".
+ ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n";
+ print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF");
+
+ my $fh = new FileHandle;
+ my $fh2 = new FileHandle;
+
+ $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n";
+ $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n";
+
+ while(<$fh>){
+
+ $record++;
+ my @t = split;
+ my ($chr1,$chr2,$mates_list)=@t[0,3,7];
+
+ if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all),
+ $nb_links++;
+
+ $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE");
+ $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE");
+
+ $t[16].= "\t";
+ $t[19].= "\t";
+
+ print $fh2 join("\t",@t)."\n";
+
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn links processed - $nb_links links kept\n";
+ $warn+=10000;
+ }
+ next;
+ }
+
+ my $ifRenv = $t[16];
+ my $ifBalanced = "UNBAL";
+ $ifBalanced = $t[18] if ($order_filtering);
+
+ my $numberOfPairs = $t[6];
+ my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14]));
+ my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15]));
+
+ if ($ifBalanced eq "BAL") {
+
+ if ($ifRenv eq "INV_TRANSLOC") {
+ $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment
+ }
+ if ($ifRenv eq "NORMAL_SENSE") {
+ $ifRenv = "TRANSLOC";
+ }
+ if ($ifRenv eq "REVERSE_SENSE") {
+ $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment
+ }
+ $t[19].= "\t";
+
+ my $meanDistance = 0;
+
+ for my $i (0..$numberOfPairs-1) {
+ $meanDistance += $positions2[$i]-$positions1[$i];
+ }
+ $meanDistance /= $numberOfPairs;
+
+ $t[16] = $ifRenv."\t".$meanDistance;
+ #dont touch the annotation. It should be already OK.
+
+ } else {
+ #only for unbalanced
+
+ my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]);
+
+ my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0].
+ (deleteBadOrderSensePairs(split (/,/,$t[9])))[0];
+ my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0].
+ (deleteBadOrderSensePairs(split (/,/,$t[11])))[0];
+
+ my $indel_type = $ifRenv;
+
+ my $meanDistance = "N/A";
+
+ ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster
+ \@positions1, #assign N/A to $indel_type if unknown
+ \@positions2,
+ $ifRenv,
+ $ifoverlap,
+ $indel_sigma_threshold,
+ $dup_sigma_threshold,
+ $singleton_sigma_threshold,
+ $mu,
+ $sigma,
+ $ifBalanced,
+ $ends_sense_class,
+ $ends_order_class,
+ $mate_sense,
+ $diff_sense_ends,
+ );
+
+ #filtering of pairs with distance inconsistant with the SV
+ if ($ifRenv ne "REVERSE_SENSE") {
+ my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length};
+ my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length};
+ $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,
+ $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering);
+ next if ($t[6] < $nb_pairs_threshold);
+ }else{
+ $t[19].= "\t";
+ }
+ $t[16] = $indel_type."\t".$meanDistance;
+ }
+
+ $nb_links++;
+
+ print $fh2 join("\t",@t)."\n";
+ if($record>=$warn){
+ print LOG "-- $fchr : $warn links processed - $nb_links links kept\n";
+ $warn+=10000;
+ }
+ }
+ $fh->close;
+ $fh2->close;
+
+ print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n";
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub checkIndel {
+
+ my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold,
+ $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_;
+
+ my $meanDistance = 0;
+
+ for my $i (0..$numberOfPairs-1) {
+ $meanDistance += $positions2->[$i]-$positions1->[$i];
+ }
+ $meanDistance /= $numberOfPairs;
+
+ return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) );
+
+ return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE");
+
+ if($diff_sense_ends){
+ return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ;
+ return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense)));
+ }else{
+ return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) &&
+ ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ;
+ }
+
+ return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap);
+
+ return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ;
+
+ return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma);
+ return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma);
+
+ return ($meanDistance, "UNDEFINED");
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size )
+sub recalc_t_usingInsertSizeInfo {
+ my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_;
+
+ my @badPairs;
+
+ my @positions1 = getAllEntries($t->[14]);
+ my @positions2 = getAllEntries($t->[15]);
+
+ if ($meanDistance < $mu) {
+ for my $i (0..scalar(@positions1)-1) {
+ if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) {
+ push(@badPairs,$i);
+ }
+ }
+ } else {
+ for my $i (0..scalar(@positions1)-1) {
+ if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) {
+ push(@badPairs,$i);
+ }
+ }
+ }
+
+ if (scalar (@badPairs)>0) {
+ #print join("\t",@badPairs).": ".join("\t",@t)."\n";
+ #remove these inconsistant links
+ $t->[6] -= scalar(@badPairs); #numberOfPairs
+ return if ($t->[6] < $nb_pairs_threshold);
+
+ $t->[7] = mark_values(\@badPairs, $t->[7]);
+ $t->[8] = mark_values(\@badPairs, $t->[8]);
+ $t->[9] = mark_values(\@badPairs, $t->[9]);
+ $t->[10] = mark_values(\@badPairs, $t->[10]);
+ $t->[11] = mark_values(\@badPairs, $t->[11]);
+
+ $t->[12] = mark_indexes(\@badPairs, $t->[12]);
+ $t->[13] = mark_indexes(\@badPairs, $t->[13]);
+
+ $t->[14] = mark_values(\@badPairs, $t->[14]);
+ $t->[15] = mark_values(\@badPairs, $t->[15]);
+ $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio
+ $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering);
+ ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length);
+ ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length);
+
+ #recalc breakpoints:
+ my $quant001 = 3.090232;
+ my $maxFragmentLength = &floor($quant001 * $sigma + $mu);
+ $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering);
+ $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering);
+ #recalc total ratio
+ $t->[22] = $t->[6] / $t->[23] if ($order_filtering);
+ $t->[18] = $t->[6] / $t->[19] unless ($order_filtering);
+
+ @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14]));
+ @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15]));
+
+ $meanDistance = 0;
+
+ for my $i (0..scalar(@positions1)-1) {
+ $meanDistance += $positions2[$i]-$positions1[$i];
+ }
+ $meanDistance /= scalar(@positions1);
+
+ } else {
+ $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering);
+ $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering);
+
+ } #nothing has been filtered
+ return $meanDistance;
+}
+
+sub recalculate_ratio {
+ my ($left, $ratio) = @_;
+ my @elements = split (/\//,$ratio);
+ $elements[1]= $elements[0];
+ $elements[0]=$left;
+ return $ratio."\t".join("/",@elements);
+}
+
+sub recalc_breakpoints {
+ my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_;
+ my $break_pont_chr;
+
+ my $leftLetterOk = substr($mate_sense, 0, 1); #R
+ my $rightLetterOk = substr($mate_sense, 1, 1); #F
+
+
+ my @positions = deleteBadOrderSensePairs(split (/,/,$startString));
+
+ unless ($diff_sense_ends) {
+ $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')';
+ } else {
+ $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')';
+ }
+ return $break_pont_chr;
+}
+sub recalculate_boundaries {
+ my ($startString,$senseString,$endsOrderString,$tag_length) = @_;
+ my @positions = deleteBadOrderSensePairs(split (/,/,$startString));
+ my @strands = deleteBadOrderSensePairs(split (/,/,$senseString));
+ my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString));
+ my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length);
+ my $coord_start_cluster = min(min(@positions),min(@ends));
+ my $coord_end_cluster = max(max(@positions),max(@ends));
+ return ($coord_start_cluster,$coord_end_cluster);
+}
+
+sub remove_indexes {
+ my ($bads, $string) = @_;
+ my @elements = deleteBadOrderSensePairs(split (/,/,$string));
+ for my $i (reverse sort %{$bads}) {
+ delete $elements[$i];
+ }
+ return "(".join(",",@elements).")";
+}
+##add @ to to elements
+sub mark_values {
+ my ($bads, $string) = @_;
+ my @elements = getAllEntries($string);
+ for my $i (@{$bads}) {
+ $elements[$i] .= "@";
+ }
+ return "(".join(",",@elements).")";
+}
+##add @ to to indexes
+sub mark_indexes {
+ my ($bads, $string) = @_;
+ my @elements = getAllEntries($string);
+ for my $i ((0..scalar(@elements)-1)) {
+ for my $j (@{$bads}) {
+ $elements[$i] .= "@" if ($elements[$i] eq ($j+1));
+ }
+ }
+
+ return "(".join(",",@elements).")";
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub redraw {
+
+ my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_;
+
+ my $out;
+ my @first_arr;
+ if ($ifBalanced eq 'BAL') {
+ my @second_arr;
+ my $lastPushed = 1;
+ if ($type == 1) {
+ for my $i (0 .. scalar(@{$arr})-1) {
+ if (exists ($table->{$i})) {
+ push(@first_arr,$arr->[$i]);
+ $lastPushed = 1;
+ }elsif (exists ($secondTable->{$i})) {
+ push(@second_arr,$arr->[$i]);
+ $lastPushed = 2;
+ } elsif ($lastPushed == 1) {
+ if (exists ($badInFRSense->{$i})) {
+ push(@first_arr,$arr->[$i]."\$");
+ }else {
+ push(@first_arr,$arr->[$i]."*");
+ }
+ } elsif ($lastPushed == 2) {
+ if (exists ($badInFRSense->{$i})) {
+ push(@second_arr,$arr->[$i]."\$");
+ }else {
+ push(@second_arr,$arr->[$i]."*");
+ }
+ } else {print "Error!";exit;}
+ }
+ } else {
+ for my $i (@{$arr}) {
+ if (exists ($table->{$i-1})) {
+ push(@first_arr,$i);
+ $lastPushed = 1;
+ }elsif (exists ($secondTable->{$i-1})) {
+ push(@second_arr,$i);
+ $lastPushed = 2;
+ } elsif ($lastPushed == 1) {
+ if (exists ($badInFRSense->{$i-1})) {
+ push(@first_arr,$i."\$");
+ }else {
+ push(@first_arr,$i."*");
+ }
+ } elsif ($lastPushed == 2) {
+ if (exists ($badInFRSense->{$i-1})) {
+ push(@second_arr,$i."\$");
+ }else {
+ push(@second_arr,$i."*");
+ }
+ } else {print "Error!";exit;}
+ }
+ }
+ $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')';
+ }
+ else {
+ if ($type == 1) {
+ for my $i (0 .. scalar(@{$arr})-1) {
+ if (exists ($table->{$i})) {
+ push(@first_arr,$arr->[$i]);
+ } else {
+ if (exists ($badInFRSense->{$i})) {
+ push(@first_arr,$arr->[$i]."\$");
+ }else {
+ push(@first_arr,$arr->[$i]."*");
+ }
+ }
+ }
+ } else {
+ for my $i (@{$arr}) {
+ if (exists ($table->{$i-1})) {
+ push(@first_arr,$i);
+ } else {
+ if (exists ($badInFRSense->{$i-1})) {
+ push(@first_arr,$i."\$");
+ }else {
+ push(@first_arr,$i."*");
+ }
+ }
+ }
+ }
+ $out = '('.join(",",@first_arr).')';
+ }
+ return $out;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub check {
+
+ my $table = $_[0];
+ my $bad = 'OK';
+ my $max = 0;
+ for my $i (sort {$a<=>$b} keys %{$table}) {
+ unless ($table->{$i}->{nonAdeq} == 0) {
+ if ($max<$table->{$i}->{nonAdeq}) {
+ $max=$table->{$i}->{nonAdeq};
+ $bad = $i;
+ }
+ }
+ }
+ return $bad;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub reversed {
+
+ my ($i,$j,$ifRenv,$positions) = @_;
+ if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){
+ return 1;
+ }
+ return 0;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub remove {
+
+ my ($bad,$table) = @_;
+ for my $i (sort {$a<=>$b} keys %{$table}) {
+ if ($bad == $i) {
+ delete($table->{$i});;
+ } else {
+ if (exists($table->{$i}->{$bad})) {
+ delete($table->{$i}->{$bad});
+ $table->{$i}->{nonAdeq}--;
+ }
+ }
+ }
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs
+
+ my ($strand,$ends_order,$mate_sense,@keysLeft) = @_;
+
+ my $leftLetterOk = substr($mate_sense, 0, 1); #R
+ my $rightLetterOk = substr($mate_sense, 1, 1); #F
+
+ my (@standardArray);
+ if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
+ $leftLetterOk = 'R';
+ $rightLetterOk = 'F';
+ @standardArray = translateSolidToRF($strand,$ends_order);
+ } else {
+ @standardArray = @{$strand};
+ }
+
+ my $ifR = 0;
+ my @Rs;
+
+ for my $i (@keysLeft) {
+ if ($standardArray[$i] eq $leftLetterOk) {
+ $ifR++;
+ push(@Rs,$i);
+ }
+ }
+
+
+ my $ifF = 0;
+ my @Fs;
+
+ for my $i (@keysLeft) {
+ if ($standardArray[$i] eq $rightLetterOk) {
+ $ifF++;
+ push(@Fs,$i);
+ }
+ }
+
+ if($ifR>=$ifF) {
+ return @Fs;
+ }
+ return @Rs;
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa
+
+ my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]);
+ my $mate_sense = $_[6];
+
+ my $leftLetterOk = substr($mate_sense, 0, 1); #R
+ my $rightLetterOk = substr($mate_sense, 1, 1); #F
+
+ my (@standardArray1,@standardArray2);
+
+ if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs
+ $leftLetterOk = 'R';
+ $rightLetterOk = 'F';
+ @standardArray1 = translateSolidToRF($strand1,$ends_order1);
+ my @arr = getOrderedStrands($strand2,$order2);
+ my @ends2 = getOrderedStrands($ends_order2,$order2);
+ @standardArray2 = translateSolidToRF(\@arr,\@ends2);
+
+ } else {
+ @standardArray1 = @{$strand1};
+ @standardArray2 = getOrderedStrands($strand2,$order2);
+ }
+
+ #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF
+
+ #for the first end:
+
+ my @array = @standardArray1;
+ my %badInFRSense1;
+ for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF
+ if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
+ $badInFRSense1{$i}=1;
+ $array[$i] = $rightLetterOk;
+ }
+ }
+ my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1);
+ @array = @standardArray1;
+ my %badInFRSense0;
+ for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR
+ if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
+ $badInFRSense0{$i-1}=1;
+ $array[$i-1] = $leftLetterOk;
+
+ }
+ }
+ my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0);
+
+ #for the second end:
+ @array = @standardArray2;
+
+ my %badInFRSense3;
+ for my $i (1..scalar(@array)-1){
+ if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
+ $badInFRSense3{$order2->[$i]}=1;
+ $array[$i] = $rightLetterOk;
+ }
+ }
+ my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3);
+
+ @array = @standardArray2;
+ my %badInFRSense5;
+ for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF
+ if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) {
+ $badInFRSense5{$i-1}=1;
+ $array[$i-1] = $leftLetterOk;
+ }
+ }
+ my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5);
+
+ if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) {
+ return (1,%badInFRSense0);
+ }
+
+ if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) {
+ return (1,%badInFRSense1);
+ }
+
+ if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) {
+ return (2,%badInFRSense3);
+ }
+
+ if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) {
+ return (2,%badInFRSense5);
+ }
+
+ #should not get here:
+ print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n";
+ return (1,%badInFRSense1);
+}
+
+sub getOrderedStrands {
+ my ($strand,$order) = ($_[0],$_[1]);
+ my @arr;
+ for my $i (0..scalar(@{$strand})-1) {
+ push(@arr,$strand->[$order->[$i]-1]);
+ }
+ return @arr;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub checkClusters {
+
+ my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_;
+ if ($ifRenv eq 'REVERSE_SENSE') {
+ if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) {
+ return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0;
+ }
+ return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0;
+ }
+ #if NORM
+ if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) {
+ return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0;
+ }
+ return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0;
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub translateSolidToRF {
+ my ($strandArr,$ends_orderArr)=@_;
+ my @array;
+ for my $i (0..scalar(@{$strandArr})-1) {
+ if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') {
+ push(@array,'F');
+ }
+ if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') {
+ push(@array,'R');
+ }
+ if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') {
+ push(@array,'R');
+ }
+ if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') {
+ push(@array,'F');
+ }
+ }
+ return @array;
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#convert the links file to the circos format
+sub links2segdup{
+
+ my($id,$color_code,$links_file,$segdup_file)=@_;
+
+ print LOG "# Converting to the circos format...\n";
+
+ tie (my %hcolor,'Tie::IxHash'); #color-code hash table
+ foreach my $col (keys %{$color_code}){
+ my ($min_links,$max_links)=split(",",$color_code->{$col});
+ $hcolor{$col}=[$min_links,$max_links];
+ }
+
+ open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
+ open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
+
+ my $index=1;
+ while(){
+
+ my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6];
+
+ my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links
+
+ print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output
+ "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n";
+ $index++;
+ }
+
+ close LINKS;
+ close SEGDUP;
+ print LOG "-- output created: $segdup_file\n";
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#convert the links file to the bedPE format for BEDTools usage
+sub links2bedPElinksfile{
+
+ my ($sample,$links_file,$bedpe_file)=@_;
+
+ open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n";
+ open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n";
+
+ my $nb_links=1;
+
+ while(){
+
+ chomp;
+ my @t=split("\t",$_);
+ my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6);
+ my $type=($chr1 eq $chr2)? "INTRA":"INTER";
+ $type.="_".$t[10];
+
+ $start1--; $start2--;
+
+ print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2".
+ "\t$sample"."_link$nb_links\t$type\t.\t.".
+ "\t".join("|",@t)."\n";
+
+ $nb_links++;
+ }
+
+ close LINKS;
+ close BEDPE;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub bedPElinks2linksfile{
+
+ my ($bedpe_file,$links_file)=@_;
+
+ open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n";
+ open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n";
+
+ while(){
+
+ chomp;
+ my $sample=(split("_",(split("\t",$_))[6]))[0];
+ my @t1=(split("\t",$_))[0,1,2,3,4,5];
+ my @t2=split(/\|/,(split("\t",$_))[10]);
+ push(@t2,$sample);
+
+ print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n";
+
+ }
+ close BEDPE;
+ close LINKS;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#convert the links file to the bed format
+sub links2bedfile{
+
+ my ($tag_length,$color_code,$links_file,$bed_file)=@_;
+
+ print LOG "# Converting to the bed format...\n";
+
+ my $compare=1;
+ if($links_file!~/compared$/){
+ $compare=0;
+ $tag_length->{none}->{1}=$tag_length->{1};
+ $tag_length->{none}->{2}=$tag_length->{2};
+ }
+
+ #color-code hash table
+ tie (my %hcolor,'Tie::IxHash');
+ my %color_order;
+ my $n=1;
+ foreach my $col (keys %{$color_code}){
+ my ($min_links,$max_links)=split(",",$color_code->{$col});
+ $hcolor{$col}=[$min_links,$max_links];
+ $color_order{$col}=$n;
+ $n++;
+ }
+
+ my %pair;
+ my %pt;
+ $n=1;
+ open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
+
+ my %str=( "F"=>"+", "R"=>"-" );
+
+ while(){
+
+ my @t=split;
+ my $sample=($compare)? pop(@t):"none";
+
+ my $chr1=$t[0];
+ my $chr2=$t[3];
+ $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
+ $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
+ my $same_chr=($chr1 eq $chr2)? 1:0;
+
+ my $count=$t[6];
+ my $color=getColor($count,\%hcolor,"bed");
+
+ my @pairs=deleteBadOrderSensePairs(split(",",$t[7]));
+ my @strand1=deleteBadOrderSensePairs(split(",",$t[8]));
+ my @strand2=deleteBadOrderSensePairs(split(",",$t[9]));
+ my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10]));
+ my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11]));
+ my @position1=deleteBadOrderSensePairs(split(",",$t[14]));
+ my @position2=deleteBadOrderSensePairs(split(",",$t[15]));
+ my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample});
+ my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample});
+
+
+ for my $p (0..$#pairs){
+
+ if (!exists $pair{$pairs[$p]}){
+
+ if($same_chr){
+
+ $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]},
+ $start1[$p]-1, $end2[$p], $color,
+ 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ];
+ $pt{$n}=$pair{$pairs[$p]}->{0};
+ $n++;
+
+ }else{
+
+ $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]},
+ $start1[$p]-1, $end1[$p], $color,
+ 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0];
+ $pt{$n}=$pair{$pairs[$p]}->{1};
+ $n++;
+
+
+ $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]},
+ $start2[$p]-1, $end2[$p], $color,
+ 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0];
+ $pt{$n}=$pair{$pairs[$p]}->{2};
+ $n++;
+ }
+ }else{
+
+ if($same_chr){
+ ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]});
+ }else{
+ ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]});
+ ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]});
+ }
+ }
+ }
+ }
+ close LINKS;
+
+ my $nb_pairs=$n-1;
+
+ open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
+ print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ".
+ "visibility=2 itemRgb=\"On\"\n";
+
+ for my $i (1..$nb_pairs){
+ print BED join("\t",@{$pt{$i}})."\n";
+ }
+
+ close BED;
+
+ print LOG "-- output created: $bed_file\n";
+
+ undef %pair;
+ undef %pt;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub deleteBadOrderSensePairs{
+
+ my (@tab)=@_;
+ my @tab2;
+
+ foreach my $v (@tab){
+
+ $v=~s/[\(\)]//g;
+ push(@tab2,$v) if($v!~/[\$\*\@]$/);
+ }
+ return @tab2;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getAllEntries{
+ my (@tab)=split (/,/,$_[0]);
+ my @tab2;
+
+ foreach my $v (@tab){
+
+ $v=~s/[\(\)]//g;
+ push(@tab2,$v);
+ }
+ return @tab2;
+}#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getAllEntriesWOspecialChar{
+ my (@tab)=split (/,/,$_[0]);
+ my @tab2;
+
+ foreach my $v (@tab){
+
+ $v=~s/[\(\)\$\*\@]//g;
+ push(@tab2,$v);
+ }
+ return @tab2;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub links2SVfile{
+
+ my($links_file,$sv_file)=@_;
+
+ print LOG "# Converting to the sv output table...\n";
+ open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n";
+ open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
+
+ my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist
+ chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering
+ final_score breakpoint1_start1-end1 breakpoint2_start2-end2);
+
+ my $nb_links=0;
+
+ while (){
+
+ my @t=split;
+ my @sv=();
+ my $sv_type="-";
+ my $strand_ratio="-";
+ my $eq_ratio="-";
+ my $eq_type="-";
+ my $insert_ratio="-";
+ my $link="-";
+ my ($bk1, $bk2)=("-","-");
+ my $score="-";
+
+ my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]);
+ my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]);
+ my $nb_pairs=$t[6];
+ $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i);
+ $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i);
+ my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER";
+
+ #if strand filtering
+ if (defined $t[16]){
+ #if inter-chr link
+ $sv_type=$t[16];
+ if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){
+ $strand_ratio=floor($1/$2*100)."%";
+ $score=$t[18];
+ }
+ if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){
+ #if intra-chr link with insert size filtering
+ $strand_ratio=floor($1/$2*100)."%";
+ $link=floor($t[17]);
+ if($sv_type!~/^INV/){
+ $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
+ $score=$t[20];
+ }else{
+ $score=$t[19];
+ }
+ }
+ }
+
+ if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){
+
+ #if strand and order filtering only and/or interchr link
+ $eq_type=$t[18];
+ $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/);
+ ($bk1, $bk2)=($t[20],$t[21]);
+ foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
+ $score=$t[22];
+
+ }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){
+
+ #if all three filtering
+ $link=floor($t[17]);
+ $eq_type=$t[19];
+ $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/);
+
+ if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){
+ $insert_ratio=floor($1/$2*100)."%";
+ ($bk1, $bk2)=($t[22],$t[23]);
+ $score=$t[24];
+
+ }else{
+ ($bk1, $bk2)=($t[21],$t[22]);
+ $score=$t[23];
+ }
+ foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;}
+
+ }
+
+
+ push(@sv, $chr_type, $sv_type,$eq_type);
+ push(@sv,"$chr1\t$start1-$end1");
+ push(@sv, $link);
+ push(@sv,"$chr2\t$start2-$end2",
+ $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2);
+
+
+ print SV join("\t",@sv)."\n";
+ }
+
+ close LINKS;
+ close SV;
+
+ system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted";
+
+ open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n";
+ my @links=;
+ close SV;
+
+ open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n";
+
+ print SV join("\t",@header)."\n";
+ print SV @links;
+ close SV;
+
+ unlink($sv_file.".sorted");
+
+ print LOG "-- output created: $sv_file\n";
+
+}
+#------------------------------------------------------------------------------#
+sub densityCalculation{
+
+ my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_;
+
+ my @sfile=split(/\./,$$mates_file[$file]);
+ my $fchr=$sfile[$#sfile];
+
+ my $fh = new FileHandle;
+
+ my %density;
+ my %density_ref;
+ my @ratio;
+ my ($cov,$cov_ref);
+
+ #FREQUENCY CALCULATION PROCEDURE
+ print LOG "# $fchr : Frequency calculation procedure...\n";
+ &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format);
+ &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format);
+
+ #MAKING RATIO AND OUTPUT
+ print LOG "\# Ratio calculation procedure...\n";
+ $density_file=~s/\/mates\//\/density\//;
+ $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n";
+
+ foreach my $k (1..$chr->{nb_chrs}){
+ foreach my $frag (1..$chr->{$k}->{nb_frag}){
+
+ @ratio= ($chr->{$k}->{name},
+ (${$chr->{$k}->{$frag}}[0]+1),
+ (${$chr->{$k}->{$frag}}[1]+1));
+
+ $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0;
+ $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0;
+
+ push(@ratio,$cov,$cov_ref);
+ push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref);
+ push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref);
+ push(@ratio,log($cov+1)) if($cov && !$cov_ref);
+ next if(!$cov && !$cov_ref);
+
+ print $fh join("\t",@ratio)."\n";
+ }
+ }
+
+ $fh->close;
+ print LOG "-- output created: $density_file\n";
+
+ undef %density;
+ undef %density_ref;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub FreqCalculation{
+
+ my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_;
+
+ my @sfile=split(/\./,$mates_file);
+ my $fchr=$sfile[$#sfile];
+ my $fh = new FileHandle;
+
+ my $nb_windows=0;
+ my $warn=100000;
+ my $record=0;
+ my %pair;
+
+ my ($sumX,$sumX2) = (0,0);
+
+ print LOG "\# Frequency calculation for $mates_file...\n";
+
+ if ($mates_file =~ /.gz$/) {
+ $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";
+ }elsif($mates_file =~ /.bam$/){
+ o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY
+ }else{
+ $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n";
+ }
+
+ while(<$fh>){
+
+ my @t=split;
+ my $mate=$t[0];
+
+ my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,);
+
+ next if(exists $pair{$mate});
+
+ next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length));
+
+ next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2});
+ ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2});
+
+ $pair{$mate}=undef;
+ $record++;
+
+ my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2);
+
+ recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format);
+ recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format);
+
+ my $length = abs($coord_start_read1-$coord_start_read2);
+ $sumX += $length; #add to sum and sum^2 for mean and variance calculation
+ $sumX2 += $length*$length;
+
+ for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){
+
+ if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){
+
+ &addToDensity($density,$chr_read1,$i,\$nb_windows)
+ if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1]));
+
+ }else{
+
+ $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step);
+ }
+ }
+
+ if($record>=$warn){
+ print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n";
+ $warn+=100000;
+ }
+ }
+ $fh->close;
+
+ print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n";
+
+ if($record>0){
+
+ my $mu = $sumX/$record;
+ my $sigma = sqrt($sumX2/$record - $mu*$mu);
+ print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n";
+ }
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub ratio2segdup{
+
+ my($id,$density_file,$segdup_file)=@_;
+
+ print LOG "# Converting to circos format...\n";
+
+ open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n";
+ open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n";
+
+ while(){
+ chomp;
+ my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5];
+ print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n";
+ }
+
+ close RATIO;
+ close SEGDUP;
+ print LOG "-- output created: $segdup_file\n";
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub ratio2bedfile{
+
+ my($density_file,$bed_file)=@_;
+
+ print LOG "# Converting to bedGraph format...\n";
+
+ open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n";
+ open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n";
+ print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ".
+ "visibility=2 color=255,0,0 alwaysZero=\"On\"\n";
+
+ while(){
+ chomp;
+ my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5];
+ $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/);
+ print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n";
+ }
+
+ close RATIO;
+ close BED;
+ print LOG "-- output created: $bed_file\n";
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub inverseSense{
+
+ my $mate_sense=$_[0];
+ my %reverse=( 'F' => 'R' , 'R' => 'F' ,
+ 'FF' => 'RR', 'RR' => 'FF',
+ 'FR' => 'RF', 'RF' => 'FR');
+ return $reverse{$mate_sense};
+}
+
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getNextFrag{
+
+ my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_;
+
+ my $how_far = $read_start-$frag_start;
+ my $nb_windows_toskip;
+
+ if($how_far>0){
+
+ $nb_windows_toskip=($how_far/$step)-($window_dist/$step);
+ $nb_windows_toskip=~ s/\..*//;
+ $nb_windows_toskip=0 if($nb_windows_toskip<0);
+ return ($frag_num + $nb_windows_toskip);
+ }
+ return $frag_last;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getColor{
+
+ my($count,$hcolor,$format)=@_;
+ for my $col ( keys % { $hcolor} ) {
+ return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]);
+ }
+ return "white" if($format eq "circos");
+ return "255,255,255" if($format eq "bed");
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub recupCoords{
+
+ my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_;
+ my $strand = 'F';
+
+ if ($c_hit=~s/^\-//) {
+ $strand='R';
+ $$cs_hit=$c_hit;
+ $$ce_hit=$c_hit-($tag_length-1);
+ }else{
+ $$cs_hit=$c_hit;
+ $$ce_hit=$c_hit+($tag_length-1);
+ }
+ return $strand;
+
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub overlap {
+ my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_;
+ if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) {
+ return 0;
+ }
+ return 1;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub makeLink {
+
+ my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_;
+
+ if($chr1>$chr2){
+ ($chr1,$chr2)= ($chr2,$chr1);
+ ($frag1,$frag2)= ($frag2,$frag1);
+ }
+
+ if($chr1 == $chr2){
+ if($frag1>$frag2){
+ ($frag1,$frag2)= ($frag2,$frag1);
+ }
+ }
+
+ if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){
+ $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt;
+ $$nb++;
+ }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){
+ $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt";
+ }
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#fonction of adding the read to the density profile
+sub addToDensity {
+
+ my ($density,$chr1,$frag1,$nb)=@_;
+
+ if(!exists $density->{$chr1}->{$frag1}->{count}){
+ $density->{$chr1}->{$frag1}->{count}=1;
+ $$nb++;
+ }else{
+ $density->{$chr1}->{$frag1}->{count}++;
+ }
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub floor {
+ my $nb = $_[0];
+ $nb=~ s/\..*//;
+ return $nb;
+}
+#------------------------------------------------------------------------------#
+sub decimal{
+
+ my $num=shift;
+ my $digs_to_cut=shift;
+
+ $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/);
+
+ return $num;
+}
+
+#------------------------------------------------------------------------------#
+sub max {
+
+ my($max) = shift(@_);
+ foreach my $temp (@_) {
+ $max = $temp if $temp > $max;
+ }
+ return($max);
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub min {
+
+ my($min) = shift(@_);
+ foreach my $temp (@_) {
+ $min = $temp if $temp < $min;
+ }
+ return($min);
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub sortTablebyIndex{
+ my ($tab1,$tab2)=@_;
+ my @tab3;
+
+ foreach my $i (@$tab1){
+ $tab3[$i]=$$tab2[$$tab1[$i]];
+ }
+ return @tab3;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub round {
+ my $number = shift || 0;
+ my $dec = 10 ** (shift || 0);
+ return int( $dec * $number + .5 * ($number <=> 0)) / $dec;
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub getUniqueTable{
+
+ my (@tab)=@_;
+ my (%saw,@out)=();
+ undef %saw;
+ return sort(grep(!$saw{$_}++, @tab));
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+sub catFiles {
+
+ unlink("$_[1]") if(exists $_[1]);
+ system qq( cat "$_" >> "$_[1]" ) for @{$_[0]};
+}
+#------------------------------------------------------------------------------#
+#------------------------------------------------------------------------------#
+#check if the configuration file is correct
+sub validateconfiguration{
+
+ my %conf=%{$_[0]};
+ my $list_prgs="@ARGV";
+
+ my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file);
+ my @detection_params=qw(split_mate_file window_size step_length split_mate_file);
+ my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file);
+ my @circos_params=qw(organism_id colorcode);
+ my @bed_params=qw(colorcode);
+ my @compare_params=qw(list_samples file_suffix);
+
+ foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){
+
+ unless (defined($dir)) {
+ $dir = ".";
+ }
+ unless (-d $dir){
+ mkdir $dir or die;
+ }
+ $dir.="/" if($dir!~/\/$/);
+ }
+
+ unless (defined($conf{general}{num_threads})) {
+ $conf{general}{num_threads} = 1;
+ }
+ $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24);
+
+ if($list_prgs!~/links2compare/){
+
+ foreach my $p (@general_params){
+ die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p});
+ }
+
+ $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam");
+
+ unless (defined($conf{general}{sv_type})) {
+ $conf{general}{sv_type} = "all";
+ }
+
+ $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}};
+ }
+
+ if($list_prgs=~/(linking|cnv)/){
+
+ foreach my $p (@detection_params){
+ die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p});
+ }
+
+ die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref});
+
+ if($conf{detection}{step_length}>$conf{detection}{window_size}){
+ die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n");
+ }
+
+ unless (-d $conf{general}{tmp_dir}."/mates"){
+ mkdir $conf{general}{tmp_dir}."/mates" or die;
+ }
+
+ if($list_prgs=~/linking/){
+ unless (-d $conf{general}{tmp_dir}."/links"){
+ mkdir $conf{general}{tmp_dir}."/links" or die;
+ }
+ }
+ if($list_prgs=~/cnv/){
+ unless (-d $conf{general}{tmp_dir}."/density"){
+ mkdir $conf{general}{tmp_dir}."/density" or die;
+ }
+ }
+
+ }
+
+ if($list_prgs=~/filtering/){
+
+ foreach my $p (@filtering_params) {
+ die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p});
+
+ }
+
+ if(defined($conf{filtering}{chromosomes})) {
+ my @chrs=split(",",$conf{filtering}{chromosomes});
+ my $exclude=($chrs[0]=~/^\-/)? 1:0;
+ for my $chrName (@chrs){
+
+ die("Error Config : The filtering parameter \"chromosomes\" is not valid\n")
+ if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude));
+
+ }
+ }
+
+ if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) {
+ die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected".
+ "\nChange strand_filtering to \"1\" if you want to use the order filtering\n");
+ }
+ if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) {
+ die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n");
+ }
+ if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) {
+ die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected".
+ "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n");
+ }
+ if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) {
+ die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n");
+ }
+
+ if (!defined($conf{filtering}{indel_sigma_threshold})) {
+ $conf{filtering}{indel_sigma_threshold} = 2;
+ }
+ if (!defined($conf{filtering}{dup_sigma_threshold})) {
+ $conf{filtering}{dup_sigma_threshold} = 2;
+ }
+ if (!defined($conf{filtering}{singleton_sigma_threshold})) {
+ $conf{filtering}{singleton_sigma_threshold} = 4;
+ }
+
+ if (!defined($conf{filtering}{nb_pairs_order_threshold})) {
+ $conf{filtering}{nb_pairs_order_threshold} = 1;
+ }
+
+ if (!defined($conf{filtering}{final_score_threshold})) {
+ $conf{filtering}{final_score_threshold} = 0.8;
+ }
+
+ if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) {
+ die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n");
+ }
+
+ }
+
+ if($list_prgs=~/2circos$/){
+ foreach my $p (@circos_params) {
+ next if($list_prgs=~/^ratio/ && $p eq "colorcode");
+ die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
+ }
+ }
+
+ if($list_prgs=~/2bed$/){
+ foreach my $p (@bed_params) {
+ die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
+ }
+ }
+
+ if($list_prgs=~/links2compare/){
+ foreach my $p (@compare_params) {
+ die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p});
+ }
+
+ unless (defined($conf{compare}{same_sv_type})) {
+ $conf{compare}{same_sv_type} = 0;
+ }
+
+ unless (defined($conf{compare}{min_overlap})) {
+ $conf{compare}{min_overlap} = 1E-9;
+ }
+
+ if($conf{compare}{circos_output}){
+ foreach my $p (@circos_params) {
+ next if($list_prgs=~/^ratio/ && $p eq "colorcode");
+ die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p});
+ }
+ }
+ if($conf{compare}{bed_output}){
+ foreach my $p (@bed_params) {
+ die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p});
+ }
+ die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths});
+
+ my @samples=split(",",$conf{compare}{list_samples});
+ my @read_lengths=split(",",$conf{compare}{list_read_lengths});
+ for my $i (0..$#samples){
+ my @l=split("-",$read_lengths[$i]);
+ $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]};
+ }
+ }
+ }
+
+
+}
+#------------------------------------------------------------------------------#
+#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/SVDetect_run_parallel.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/SVDetect_run_parallel.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,324 @@
+
+
+and identify structural variants
+
+SVDetect_run_parallel.pl
+
+#if $getLinks.linking == "linking"
+linking
+
+#end if
+#if $getFilteredLinks.filtering == "filtering"
+filtering
+
+#if str($getFilteredLinks.links2SV) == "create"
+links2SV
+-out3 '$sv_file'
+#end if
+#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2circos) == "create"
+links2circos
+-out4 '$circos_file'
+#end if
+#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2bed) == "create"
+links2bed
+-out5 '$bed_file'
+#end if
+#end if
+-conf '$config_file'
+-l '$log_file'
+-N '$sample_name'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (
+ getFilteredLinks['filtering']=="filtering" and
+ getFilteredLinks['links2SV'] is True
+ )
+
+
+
+ (
+ getFilteredLinks['filtering']=="filtering" and
+ getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and
+ getFilteredLinks['file_conversion']['links2circos'] is True
+ )
+
+
+
+ (
+ getFilteredLinks['filtering']=="filtering" and
+ getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and
+ getFilteredLinks['file_conversion']['links2bed'] is True
+ )
+
+
+
+
+
+
+
+
+
+<general>
+input_format = bam
+sv_type = ${sv_type}
+mates_orientation=${mates_orientation}
+read1_length=${read1_length}
+read2_length=${read2_length}
+mates_file=${mates_file}
+cmap_file=${cmap_file}
+tmp_dir=$__new_file_path__/svdetect/tmp
+output_dir=$__new_file_path__/svdetect
+num_threads=8
+</general>
+
+#if $getLinks.linking == "linking"
+<detection>
+#if str($getLinks.splitmate) == "split"
+split_mate_file=1
+#else
+split_mate_file=0
+#end if
+window_size=${getLinks.window_size}
+step_length=${getLinks.step_length}
+</detection>
+#end if
+
+#if $getFilteredLinks.filtering == "filtering"
+<filtering>
+#if str($getFilteredLinks.splitlink) == "split"
+split_link_file=1
+#else
+split_link_file=0
+#end if
+#if str($getFilteredLinks.chromosomes) != ""
+chromosomes=${getFilteredLinks.chromosomes}
+#end if
+nb_pairs_threshold=${getFilteredLinks.nb_pairs_threshold}
+#if $getFilteredLinks.filter1.strand_filtering == "strand"
+strand_filtering=1
+final_score_threshold=${getFilteredLinks.filter1.final_score_threshold}
+#if $getFilteredLinks.filter1.filter2.order_filtering == "order"
+order_filtering=1
+mu_length=${getFilteredLinks.filter1.filter2.mu_length}
+sigma_length=${getFilteredLinks.filter1.filter2.sigma_length}
+nb_pairs_order_threshold=${getFilteredLinks.filter1.filter2.nb_pairs_order_threshold}
+#if $getFilteredLinks.filter1.filter2.filter3.insert_size_filtering == "insert"
+insert_size_filtering=1
+indel_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.indel_sigma_threshold}
+dup_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.dup_sigma_threshold}
+singleton_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.singleton_sigma_threshold}
+#else
+insert_size_filtering=0
+#end if
+#else
+order_filtering=0
+#end if
+#else
+strand_filtering=0
+#end if
+</filtering>
+#end if
+
+#if $getFilteredLinks.filtering == "filtering"
+#if $getFilteredLinks.file_conversion.file_conversion_select == "convert"
+#if str($getFilteredLinks.file_conversion.links2circos) == "create"
+<circos>
+organism_id=${getFilteredLinks.file_conversion.organism_id}
+<colorcode>
+#for $color_repeat in $getFilteredLinks.file_conversion.color_code
+${color_repeat.color}=${color_repeat.interval}
+#end for
+</colorcode>
+</circos>
+#end if
+#if str($getFilteredLinks.file_conversion.links2bed) == "create"
+<bed>
+<colorcode>
+#for $color_repeat in $getFilteredLinks.file_conversion.color_code
+#if str($color_repeat.color)== "grey"
+190,190,190=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "black"
+0,0,0=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "blue"
+0,0,255=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "green"
+0,255,0=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "purple"
+153,50,205=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "orange"
+255,140,0=${color_repeat.interval}
+#end if
+#if str($color_repeat.color)== "red"
+255,0,0=${color_repeat.interval}
+#end if
+#end for
+</colorcode>
+</bed>
+#end if
+#end if
+#end if
+
+
+
+
+**What it does**
+
+SVDetect - Version : 0.8b
+
+Parallel version (nCPU=8)
+
+SVDetect is a application for the isolation and the type prediction of intra- and inter-chromosomal rearrangements from paired-end/mate-pair sequencing data provided by the high-throughput sequencing technologies
+
+This tool aims to identifying structural variations (SVs) with both clustering and sliding-window strategies, and helping in their visualization at the genome scale.
+SVDetect is compatible with SOLiD and Illumina (>=1.3) reads.
+
+Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html
+
+-----
+
+.. class:: infomark
+
+Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect.
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/svdetect/circos_graph.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/svdetect/circos_graph.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,290 @@
+
+
+plots
+
+circos/bin/circos
+
+-conf '$circos_config_file'
+-outputfile '${outputfile}.dat'
+-png
+
+> '$log_file'
+
+;
+
+rm '$outputfile'; ln -s '${outputfile}.png' '$outputfile'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<ideogram>
+
+<spacing>
+
+default = 5u
+break = 1u
+
+axis_break_at_edge = yes
+axis_break = yes
+axis_break_style = 2
+
+<break_style 1>
+stroke_color = black
+fill_color = blue
+thickness = 0.25r
+stroke_thickness = 2
+</break>
+
+<break_style 2>
+stroke_color = black
+stroke_thickness = 3p
+thickness = 1.5r
+</break>
+
+</spacing>
+
+## thickness (px) of chromosome ideogram
+thickness = 100p
+stroke_thickness = 2
+## ideogram border color
+stroke_color = black
+fill = yes
+## the default chromosome color is set here and any value
+## defined in the karyotype file overrides it
+fill_color = black
+
+## fractional radius position of chromosome ideogram within image
+radius = 0.85r
+show_label = yes
+label_with_tag = yes
+label_font = condensedbold
+label_radius = dims(ideogram,radius) + 0.075r
+label_size = 60p
+
+## cytogenetic bands
+band_stroke_thickness = 2
+
+## show_bands determines whether the outline of cytogenetic bands
+## will be seen
+show_bands = yes
+## in order to fill the bands with the color defined in the karyotype
+## file you must set fill_bands
+fill_bands = yes
+
+</ideogram>
+
+
+
+
+
+show_ticks = yes
+show_tick_labels = yes
+
+<ticks>
+radius = dims(ideogram,radius_outer)
+multiplier = 1e-6
+
+<tick>
+spacing = 0.5u
+size = 2p
+thickness = 2p
+color = grey
+show_label = no
+label_size = 12p
+label_offset = 0p
+format = %.2f
+</tick>
+
+<tick>
+spacing = 1u
+size = 3p
+thickness = 2p
+color = dgrey
+show_label = no
+label_size = 12p
+label_offset = 0p
+format = %.2f
+</tick>
+
+<tick>
+spacing = 5u
+size = 5p
+thickness = 2p
+color = black
+show_label = yes
+label_size = 16p
+label_offset = 0p
+format = %d
+</tick>
+
+<tick>
+spacing = 10u
+size = 8p
+thickness = 2p
+color = black
+show_label = yes
+label_size = 20p
+label_offset = 5p
+format = %d
+</tick>
+</ticks>
+
+
+
+
+<colors>
+<<include etc/colors.conf>>
+</colors>
+
+<fonts>
+<<include etc/fonts.conf>>
+</fonts>
+
+<<include $ideogram_config_file>>
+<<include $ticks_config_file>>
+
+karyotype = $karyotype
+
+<image>
+24bit = yes
+##png = yes
+##svg = no
+## radius of inscribed circle in image
+radius = 1500p
+background = white
+## by default angle=0 is at 3 o'clock position
+angle_offset = -90
+#angle_orientation = counterclockwise
+
+auto_alpha_colors = yes
+auto_alpha_steps = 5
+</image>
+
+chromosomes_units= $chromosomes_units
+
+#if str($chromosomes)==""
+chromosomes_display_default = yes
+#else
+chromosomes_display_default = no
+chromosomes = $chromosomes
+#end if
+
+<links>
+
+z = 0
+radius = 0.95r
+bezier_radius = 0.2r
+
+<link segdup>
+show = yes
+color = dgrey_a5
+thickness = 2
+file = $link_file
+record_limit = 1000
+</link>
+
+</links>
+
+
+anglestep = 0.5
+minslicestep = 10
+beziersamples = 40
+debug = no
+warnings = no
+imagemap = no
+
+units_ok = bupr
+units_nounit = n
+
+
+
+
+**What it does**
+
+Circos
+
+Manual documentation available at the http://circos.ca/
+
+
+**Example of link segdup file**
+
+segdup file::
+
+ 1 hs1 1077096 1078746 color=red
+ 1 hs1 1080923 1082805 color=red
+ 2 hs1 1137684 1137961 color=red
+ 2 hs3 1138138 1138423 color=red
+ 3 hs11 1169417 1170000 color=red
+ 3 hs11 1170025 1170975 color=red
+ 4 hs11 1222480 1224271 color=green
+ 4 hs11 1223328 1225675 color=green
+ 5 hs12 1223336 1225812 color=grey
+ 5 hs13 1224709 1227633 color=grey
+ 6 hs11 1223621 1226460 color=red
+ 6 hs11 1224918 1227633 color=red
+ 7 hs11 1399510 1401513 color=white
+ 7 hs11 1401628 1403697 color=white
+ 8 hs15 1652045 1653746 color=red
+ 8 hs15 1657167 1658940 color=red
+ 9 hs11 165333 165887 color=white
+ 9 hs11 165981 168016 color=white
+ 10 hs11 1702700 1702841 color=red
+ 10 hs11 1702903 1703057 color=red
+ 11 hs11 1912272 1915186 color=white
+ 11 hs11 1937111 1939824 color=white
+ 12 hs11 1983211 1983355 color=red
+ 12 hs11 1983591 1983748 color=red
+ 13 hs11 2913657 2913898 color=white
+ 13 hs11 2914048 2914341 color=white
+ 14 hs11 3090593 3090749 color=purple
+ 14 hs11 3090709 3090864 color=purple
+ 15 hs21 3466365 3466434 color=red
+ 15 hs21 3466554 3466620 color=red
+ 16 hsX 3603073 3603321 color=white
+ 16 hsX 3603295 3603520 color=white
+
+
+
+-----
+
+.. class:: infomark
+
+Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of Circos.
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/varscan/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/varscan/tool_dependencies.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,19 @@
+
+
+
+
+
+ http://downloads.sourceforge.net/project/varscan/VarScan.v2.3.5.jar
+
+
+ $INSTALL_DIR/jars
+
+
+ $INSTALL_DIR/jars
+
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/varscan/varscan_mpileup.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/varscan/varscan_mpileup.pl Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+
+use strict;
+use Cwd;
+
+die qq(
+Bad numbr of inputs
+
+) if(!@ARGV);
+
+my $options ="";
+my $file="";
+my $command="";
+my $output="";
+my $working_dir = cwd();
+my $temp_vcf = "$working_dir/temp";
+my $log="";
+
+foreach my $input (@ARGV)
+{
+ my @tmp = split "::", $input;
+ if($tmp[0] eq "COMMAND")
+ {
+ $command = $tmp[1];
+ }
+ elsif($tmp[0] eq "INPUT")
+ {
+ $file = $tmp[1];
+ }
+ elsif($tmp[0] eq "OPTION")
+ {
+ $options = "$options ${tmp[1]}";
+ }
+ elsif($tmp[0] eq "OUTPUT")
+ {
+ $output = $tmp[1];
+ }
+ elsif($tmp[0] eq "LOG")
+ {
+ $log = $tmp[1];
+ }
+ else
+ {
+ die("Unknown Input: $input\n");
+ }
+}
+
+system ("$command $file $options 1>$temp_vcf 2>$log");
+
+vs2vcf($temp_vcf, $output);
+
+
+sub vs2vcf
+{
+
+ #
+ # G l o b a l v a r i a b l e s
+ #
+ my $version = '0.1';
+
+ #
+ # Read in file
+ #
+ my $input = shift;
+ my $output = shift;
+ my $chr_ord = shift;
+ open(IN, $input) or die "Can't open $input': $!\n";
+ open(OUT, ">$output") or die "Can't create $output': $!\n";
+ my %output;
+
+ while ( )
+ {
+ if ( /^#/ )
+ {
+ print OUT;
+ next;
+ }
+ chomp;
+ my $line = $_;
+
+ my @flds = split ( "\t", $line );
+ my $ref = $flds[3];
+ my $alt = $flds[4];
+ #
+ # Deletion of bases
+ #
+ if ( $alt =~ /^\-/ )
+ {
+ ($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref);
+ }
+
+ #
+ # Insertion of bases
+ #
+ if ( $alt =~ /^\+/ )
+ {
+ $flds[4] = $ref.substr($alt,1);
+ }
+ print OUT join( "\t", @flds),"\n" unless defined $chr_ord;
+ $output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord;
+ }
+ close(IN);
+ # if chromosome order given return in sorted order
+ if(defined $chr_ord)
+ {
+ for my $chrom (@{ $chr_ord })
+ {
+ for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} })
+ {
+ print OUT $output{$chrom}{$pos};
+ }
+ }
+ }
+ close(OUT);
+}
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/varscan/varscan_mpileup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/varscan/varscan_mpileup.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,124 @@
+
+
+ mutation caller for targeted, exome, and whole-genome resequencing
+
+
+ VarScan
+
+
+
+ varscan_mpileup.pl
+ "COMMAND::java -jar \$JAVA_JAR_PATH/VarScan.v2.3.5.jar $exe_command"
+ "INPUT::$in_file"
+ "OUTPUT::$output"
+ "LOG::$log"
+ "OPTION::--min-coverage $min_coverage"
+ "OPTION::--min-reads2 $min_reads2"
+ "OPTION::--min-avg-qual $min_avg_qual"
+ "OPTION::--min-var-freq $min_var_freq"
+ "OPTION::--min-freq-for-hom $min_freq_for_hom"
+ "OPTION::--p-value $p_value"
+ "OPTION::--strand-filter $strand_filter"
+ "OPTION::--output-vcf 1"
+
+ #if ($vcf_sample_list):
+ "OPTION::--vcf-sample-list $vcf_sample_list"
+ #end if
+ "OPTION::--variants $variants"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+::
+
+ VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. It can be used to detect different types of variation:
+
+ Germline variants (SNPs an dindels) in individual samples or pools of samples.
+ Multi-sample variants (shared or private) in multi-sample datasets (with mpileup).
+ Somatic mutations, LOH events, and germline variants in tumor-normal pairs.
+ Somatic copy number alterations (CNAs) in tumor-normal exome data.
+
+
+**Input**
+
+::
+
+ mpileup file - The SAMtools mpileup file
+
+
+**Parameters**
+
+::
+
+ commands
+ mpileup2snp Identify SNPs from an mpileup file
+ mpileup2indel Identify indels an mpileup file
+ mpileup2cns Call consensus and variants from an mpileup file
+
+ min-coverage
+ Minimum read depth at a position to make a call [8]
+
+ min-reads2
+ Minimum supporting reads at a position to call variants [2]
+
+ min-avg-qual
+ Minimum base quality at a position to count a read [15]
+
+ min-var-freq
+ Minimum variant allele frequency threshold [0.01]
+
+ min-freq-for-hom
+ Minimum frequency to call homozygote [0.75]
+
+ p-value
+ Default p-value threshold for calling variants [99e-02]
+
+ strand-filter
+ Ignore variants with >90% support on one strand [1]
+
+ output-vcf
+ If set to 1, outputs in VCF format
+
+ vcf-sample-list
+ For VCF output, a list of sample names in order, one per line
+
+ variants
+ Report only variant (SNP/indel) positions [0]
+
+
+
+
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/varscan/varscan_somatic.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/varscan/varscan_somatic.pl Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,138 @@
+#!/usr/bin/perl
+
+
+use strict;
+use Cwd;
+
+die qq(
+Bad numbr of inputs
+
+) if(!@ARGV);
+
+my $options ="";
+my $normal="";
+my $command="";
+my $tumor="";
+my $output="";
+my $working_dir = cwd();
+my $snp = "$working_dir/output.snp.vcf";
+my $indels = "$working_dir/output.indel.vcf";
+
+foreach my $input (@ARGV)
+{
+ my @tmp = split "::", $input;
+ if($tmp[0] eq "COMMAND")
+ {
+ $command = $tmp[1];
+ }
+ if($tmp[0] eq "NORMAL")
+ {
+ $normal = $tmp[1];
+ }
+ elsif($tmp[0] eq "TUMOR")
+ {
+ $tumor = $tmp[1];
+ }
+ elsif($tmp[0] eq "OPTION")
+ {
+ $options = "$options ${tmp[1]}";
+ }
+ elsif($tmp[0] eq "OUTPUT")
+ {
+ $output = $tmp[1];
+ }
+
+ else
+ {
+ die("Unknown Input: $input\n");
+ }
+}
+
+system ("$command $normal $tumor $options ");
+system("grep -v '^\#' $indels | grep -v '^chrom position' >> $snp");
+
+my @chr_ord = chromosome_order($tumor);
+
+vs2vcf($snp, $output,\@chr_ord);
+
+
+sub vs2vcf
+{
+
+ #
+ # G l o b a l v a r i a b l e s
+ #
+ my $version = '0.1';
+
+ #
+ # Read in file
+ #
+ my $input = shift;
+ my $output = shift;
+ my $chr_ord = shift;
+ open(IN, $input) or die "Can't open $input': $!\n";
+ open(OUT, ">$output") or die "Can't create $output': $!\n";
+ my %output;
+
+ while ( )
+ {
+ if ( /^#/ )
+ {
+ print OUT;
+ next;
+ }
+ chomp;
+ my $line = $_;
+
+ my @flds = split ( "\t", $line );
+ my $ref = $flds[3];
+ my $alt = $flds[4];
+ #
+ # Deletion of bases
+ #
+ if ( $alt =~ /^\-/ )
+ {
+ ($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref);
+ }
+
+ #
+ # Insertion of bases
+ #
+ if ( $alt =~ /^\+/ )
+ {
+ $flds[4] = $ref.substr($alt,1);
+ }
+ print OUT join( "\t", @flds),"\n" unless defined $chr_ord;
+ $output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord;
+ }
+ close(IN);
+ # if chromosome order given return in sorted order
+ if(defined $chr_ord)
+ {
+ for my $chrom (@{ $chr_ord })
+ {
+ for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} })
+ {
+ print OUT $output{$chrom}{$pos};
+ }
+ }
+ }
+ close(OUT);
+}
+
+
+sub chromosome_order
+{
+ my $input = shift;
+ # calculate flagstats
+ my $COMM = "samtools view -H $input | grep '^\@SQ'";
+ my @SQ = `$COMM`;
+ chomp @SQ;
+ for(my $i = 0; $i <= $#SQ; $i++)
+ {
+ $SQ[$i] =~ s/^\@SQ\tSN:(.*?)\tLN:\d+$/$1/;
+ }
+ return(@SQ);
+}
+
+
diff -r 2c7824a8d764 -r 0d10255b5434 ngsap-vc/varscan/varscan_somatic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ngsap-vc/varscan/varscan_somatic.xml Thu Oct 03 10:42:15 2019 -0400
@@ -0,0 +1,131 @@
+
+
+ somatic mutation caller for cancer genomics
+
+
+ VarScan
+
+
+ varscan_somatic.pl
+ "COMMAND::java -jar \$JAVA_JAR_PATH/VarScan.v2.3.5.jar somatic"
+ "NORMAL::$normal"
+ "TUMOR::$tumor"
+ "OUTPUT::$output"
+
+ "OPTION::--min-coverage $min_coverage"
+ "OPTION::--min-coverage-normal $min_coverage_normal"
+ "OPTION::--min-coverage-tumor $min_coverage_tumor"
+
+ "OPTION::--min-var-freq $min_var_freq"
+ "OPTION::--min-freq-for-hom $min_freq_for_hom"
+
+ "OPTION::--normal-purity $normal_purity"
+ "OPTION::--tumor-purity $tumor_purity"
+
+ "OPTION::--p-value $p_value"
+ "OPTION::--somatic-p-value $somatic_p_value"
+
+ "OPTION::--strand-filter $strand_filter"
+ "OPTION::--validation $validation"
+ "OPTION::--output-vcf 1"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+::
+
+ VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. It can be used to detect different types of variation:
+
+ Germline variants (SNPs an dindels) in individual samples or pools of samples.
+ Multi-sample variants (shared or private) in multi-sample datasets (with mpileup).
+ Somatic mutations, LOH events, and germline variants in tumor-normal pairs.
+ Somatic copy number alterations (CNAs) in tumor-normal exome data.
+
+
+**Input**
+
+::
+
+ mpileup normal file - The SAMtools mpileup file for normal
+ mpileup tumor file - The SAMtools mpileup file for tumor
+
+
+**Parameters**
+
+::
+
+ min-coverage
+ Minimum read depth at a position to make a call [8]
+
+ min-coverage-normal
+ Minimum coverage in normal to call somatic [8]
+
+ min-coverage-tumor
+ Minimum coverage in tumor to call somatic [6]
+
+ min-var-freq
+ Minimum variant frequency to call a heterozygote [0.10]
+
+ min-freq-for-hom
+ Minimum frequency to call homozygote [0.75]
+
+ normal-purity
+ Estimated purity (non-tumor content) of normal sample [1.00]
+
+ tumor-purity
+ Estimated purity (tumor content) of tumor sample [1.00]
+
+ p-value
+ Default p-value threshold for calling variants [0.99]
+
+ somatic-p-value
+ P-value threshold to call a somatic site [0.05]
+
+ strand-filter
+ If set to 1, removes variants with >90% strand bias
+
+ validation
+ If set to 1, outputs all compared positions even if non-variant
+
+ output-vcf
+ If set to 1, outputs in VCF format [Default]
+
+
+
+
+
+