Velvet Galaxy Composite Dataset

# HG changeset patch # User scisjnu123 # Date 1568355683 14400 # Node ID 24d2dc34dd17c29953a7cff53d2bd8ee39a6cc99 # Parent e1ad03534fb204767ce4c6ef6b5c855bdf72bbcc Uploaded diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/analyze_covariates.xml --- a/GATK/gatk/analyze_covariates.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/base_recalibrator.xml --- a/GATK/gatk/base_recalibrator.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/combine_gvcfs.xml --- a/GATK/gatk/combine_gvcfs.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/combine_variants.xml --- a/GATK/gatk/combine_variants.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/gatk.xml --- a/GATK/gatk/gatk.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,179 +0,0 @@ - - - tool collection Version @VERSION@ - - gatk_macros.xml - realigner_target_creator.xml - indel_realigner.xml - base_recalibrator.xml - analyze_covariates.xml - print_reads.xml - haplotype_caller.xml - genotype_gvcfs.xml - combine_gvcfs.xml - combine_variants.xml - - - - - - - - - - - &1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2 -]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - analysis_type['analysis_type_selector'] == 'RealignerTargetCreator' - - - analysis_type['analysis_type_selector'] == 'IndelRealigner' - - - analysis_type['analysis_type_selector'] == 'BaseRecalibrator' - - - analysis_type['analysis_type_selector'] == 'AnalyzeCovariates' - - - analysis_type['analysis_type_selector'] == 'PrintReads' - - - analysis_type['analysis_type_selector'] == 'HaplotypeCaller' - - - analysis_type['analysis_type_selector'] == 'GenotypeGVCFs' - - - analysis_type['analysis_type_selector'] == 'CombineGVCFs' - - - analysis_type['analysis_type_selector'] == 'CombineVariants' - - - - - - 10.1101/gr.107524.110 - 10.1038/ng.806 - 10.1002/0471250953.bi1110s43 - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/gatk_macros.xml --- a/GATK/gatk/gatk_macros.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,166 +0,0 @@ - - - - - gatk - GATK_PATH - GATK_SITE_OPTIONS - package_r_for_gatk_3_4_0 - - - - - - - - 3.4-0 - ${tool.name} - ${analysis_type.analysis_type_selector} - - 1: - THREAD_STRING="-nct $cond_threads.nct" && - #end if - #if int($cond_threads.nt) > 1: - THREAD_STRING=$THREAD_STRING" -nt $cond_threads.nt" && - #end if - #if int($cond_threads.mem) > 0: - GATK_MEM=$cond_threads.mem && - #end if - #end if - java -Xmx\${GATK_MEM:-\${SLURM_MEM_PER_NODE:-4096}}M -jar "\$GATK_PATH/GenomeAnalysisTK.jar" \${THREAD_STRING:-} -]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/generation/gatk.xsl --- a/GATK/gatk/generation/gatk.xsl Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ - - - - - - - - tool collection Version @VERSION@ - - - gatk_macros.xml - -

- - - - - - - - - - - - - - - -<![CDATA[ - ############################ - ## import analysis specific preprocessings by using cheetahs internal searchList - ## if not defined, ignore - ############################ - #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2] - #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"] - #include source=$analysisPreprocessing - #end if - - ############################ - ## GATK tool unspecific options - ############################ - @GATK_EXEC@ - - --analysis_type ${analysis_type.analysis_type_selector} - --reference_sequence ${ref_file.fields.path} - - --log_to_file ${output_log} - - #if $cond_intervals.cond_intervals_enabled - #for $interval in $cond_intervals.intervals: - --intervals ${interval.L} - #end for - #end if - - #if $cond_BQSR.cond_BQSR_enabled - --BQSR $cond_BQSR.BQSR - #end if - - ############################ - ## import analysis specific options by using cheetahs internal searchList - ## if not defined throw raw python error until better idea - ############################ - #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2] - #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] - #include source=$analysisOptions - #else - #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] - #end if - - ############################ - ## only put ERROR or FATAL log messages into stderr - ## but keep full log for printing into log file - ############################ - 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2 -]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - analysis_type['analysis_type_selector'] == '' - - - - - - - - - 10.1101/gr.107524.110 - 10.1038/ng.806 - 10.1002/0471250953.bi1110s43 - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/generation/gatk.xsldb.xml --- a/GATK/gatk/generation/gatk.xsldb.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ - - - - RealignerTargetCreator - bam - rtc - realigner_target_creator.xml - - - IndelRealigner - bam - ir - indel_realigner.xml - - - BaseRecalibrator - bam - br - base_recalibrator.xml - - - AnalyzeCovariates - bam - ac - analyze_covariates.xml - - - PrintReads - bam - pr - print_reads.xml - - - HaplotypeCaller - bam - hc - haplotype_caller.xml - - - GenotypeGVCFs - gvcf - gg - genotype_gvcfs.xml - - - CombineGVCFs - gvcf - cg - combine_gvcfs.xml - - - CombineVariants - vcf - cv - combine_variants.xml - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/genotype_gvcfs.xml --- a/GATK/gatk/genotype_gvcfs.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/haplotype_caller.xml --- a/GATK/gatk/haplotype_caller.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/indel_realigner.xml --- a/GATK/gatk/indel_realigner.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/print_reads.xml --- a/GATK/gatk/print_reads.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/realigner_target_creator.xml --- a/GATK/gatk/realigner_target_creator.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/tool-data/destinations.py --- a/GATK/gatk/tool-data/destinations.py Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -from galaxy.jobs import JobDestination -import os -import sys -import json -import cStringIO -import logging - -log = logging.getLogger( __name__ ) - - -def dump(obj, nested_level=0, output=sys.stdout): - spacing = ' ' - if type(obj) == dict: - print >> output, '%s{' % ((nested_level) * spacing) - for k, v in obj.items(): - if hasattr(v, '__iter__'): - print >> output, '%s%s:' % ((nested_level + 1) * spacing, k) - dump(v, nested_level + 1, output) - else: - print >> output, '%s%s: %s' % ((nested_level + 1) * spacing, k, v) - print >> output, '%s}' % (nested_level * spacing) - elif type(obj) == list: - print >> output, '%s[' % ((nested_level) * spacing) - for v in obj: - if hasattr(v, '__iter__'): - dump(v, nested_level + 1, output) - else: - print >> output, '%s%s' % ((nested_level + 1) * spacing, v) - print >> output, '%s]' % ((nested_level) * spacing) - else: - print >> output, '%s%s' % (nested_level * spacing, obj) - - -def dynamic_slurm_cluster_gatk(job, tool_id): - # Allocate extra time - inp_data = dict( [ ( da.name, da.dataset ) for da in job.input_datasets ] ) - inp_data.update( [ ( da.name, da.dataset ) for da in job.input_library_datasets ] ) - inp_data.update( [ ( da.name, json.loads(da.value) ) for da in job.parameters ] ) - out = cStringIO.StringIO() - dump(inp_data, 1, out) - log.debug(out.getvalue()) - - nativeSpecs = '--nodes=1 --ntasks=1' - - # runner doesn't allow to specify --cpus-per-task - # thus the mem calculation gets messy with more than 1 node - # --> translate nt ==> nodes, nct ==> ntasks - - if 'cond_threads' not in inp_data: - return JobDestination(runner="slurm") - - if inp_data['cond_threads']['cond_threads_enabled'] == "True": - nNodes = int(inp_data['cond_threads']['nt']) - nCPU = int(inp_data['cond_threads']['nct']) - nMEM = int(inp_data['cond_threads']['mem']) - if nMEM > 0: - nativeSpecs = '--nodes=%d --ntasks=%d --mem=%d' % (nNodes, nCPU*nNodes, nMEM) - else: - nativeSpecs = '--nodes=%d --ntasks=%d' % (nNodes, nCPU*nNodes) - - return JobDestination(runner="slurm", params={"nativeSpecification": nativeSpecs}) - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/tool-data/picard_index.loc.sample --- a/GATK/gatk/tool-data/picard_index.loc.sample Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use a directory of Picard dict and associated files. You will need -#to create these data files and then create a picard_index.loc file -#similar to this one (store it in this directory) that points to -#the directories in which those files are stored. The picard_index.loc -#file has this format (longer white space is the TAB character): -# -# -# -#So, for example, if you had hg18 indexed and stored in -#/depot/data2/galaxy/srma/hg18/, -#then the srma_index.loc entry would look like this: -# -#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa -# -#and your /depot/data2/galaxy/srma/hg18/ directory -#would contain the following three files: -#hg18.fa -#hg18.dict -#hg18.fa.fai -# -#The dictionary file for each reference (ex. hg18.dict) must be -#created via Picard (http://picard.sourceforge.net). Note that -#the dict file does not have the .fa extension although the -#path list in the loc file does include it. -# diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/tool_data_table_conf.xml.sample --- a/GATK/gatk/tool_data_table_conf.xml.sample Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ - - - - value, dbkey, name, path - -

- diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/gatk/tool_dependencies.xml --- a/GATK/gatk/tool_dependencies.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - - - - /mnt/galaxy/tools/GATK/3.4-0 - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/package_picard_1_135/tool_dependencies.xml --- a/GATK/package_picard_1_135/tool_dependencies.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - - - - - https://github.com/broadinstitute/picard/releases/download/1.135/picard-tools-1.135.zip - - . - $INSTALL_DIR - - - - $INSTALL_DIR - - - - -This picard package dependency is retrieved directly from https://github.com/broadinstitute/picard/releases - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 GATK/package_r_for_gatk_3_4_0/tool_dependencies.xml --- a/GATK/package_r_for_gatk_3_4_0/tool_dependencies.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ - - - - - - - - - - - - - - https://github.com/cran/stringi/archive/0.5-5.tar.gz - https://github.com/cran/magrittr/archive/1.5.tar.gz - https://github.com/cran/stringr/archive/1.0.0.tar.gz - https://github.com/cran/RColorBrewer/archive/1.1-2.tar.gz - https://github.com/cran/dichromat/archive/2.0-0.tar.gz - https://github.com/cran/colorspace/archive/1.2-6.tar.gz - https://github.com/cran/munsell/archive/0.4.2.tar.gz - https://github.com/cran/labeling/archive/0.3.tar.gz - https://github.com/cran/Rcpp/archive/0.11.6.tar.gz - https://github.com/cran/digest/archive/0.6.8.tar.gz - https://github.com/cran/gtable/archive/0.1.2.tar.gz - https://github.com/cran/bitops/archive/1.0-6.tar.gz - https://github.com/cran/caTools/archive/1.17.1.tar.gz - https://github.com/cran/gtools/archive/3.5.0.tar.gz - https://github.com/cran/gdata/archive/2.17.0.tar.gz - https://github.com/cran/gsalib/archive/2.1.tar.gz - https://github.com/cran/gplots/archive/2.17.0.tar.gz - https://github.com/cran/plyr/archive/1.8.3.tar.gz - https://github.com/cran/reshape/archive/0.8.5.tar.gz - https://github.com/cran/reshape2/archive/1.4.1.tar.gz - https://github.com/cran/scales/archive/0.2.5.tar.gz - https://github.com/cran/proto/archive/0.3-10.tar.gz - https://github.com/cran/MASS/archive/7.3-43.tar.gz - https://github.com/cran/ggplot2/archive/1.0.1.tar.gz - - - - - ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts. - It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics. - - http://ggplot2.org/ - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 Galaxy-Workflow-Haplotypecaller.ga --- a/Galaxy-Workflow-Haplotypecaller.ga Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -{ - "a_galaxy_workflow": "true", - "annotation": "", - "format-version": "0.1", - "name": "Haplotypecaller", - "steps": { - "0": { - "annotation": "", - "id": 0, - "input_connections": {}, - "inputs": [ - { - "description": "", - "name": "Input Dataset" - } - ], - "label": null, - "name": "Input dataset", - "outputs": [], - "position": { - "left": 175, - "top": 204 - }, - "tool_errors": null, - "tool_id": null, - "tool_state": "{\"name\": \"Input Dataset\"}", - "tool_version": null, - "type": "data_input", - "user_outputs": [], - "uuid": "e6bc3c94-7207-4af0-a856-666d00515f57" - }, - "1": { - "annotation": "", - "id": 1, - "input_connections": { - "analysis_type|cond_bam_input|input": { - "id": 0, - "output_name": "output" - } - }, - "inputs": [], - "label": null, - "name": "GATK", - "outputs": [ - { - "name": "rtc_output_intervals", - "type": "gatk_interval" - }, - { - "name": "ir_output_bam", - "type": "bam" - }, - { - "name": "br_table", - "type": "tabular" - }, - { - "name": "ac_plotsReportFile", - "type": "pdf" - }, - { - "name": "pr_output_bam", - "type": "bam" - }, - { - "name": "hc_output_gvcf", - "type": "vcf" - }, - { - "name": "gg_output_gvcf", - "type": "vcf" - }, - { - "name": "cg_output_vcf", - "type": "vcf" - }, - { - "name": "cv_output_vcf", - "type": "vcf" - }, - { - "name": "output_log", - "type": "txt" - } - ], - "position": { - "left": 381, - "top": 196 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/scisjnu123/test/gatk/3.4-0.d9", - "tool_state": "{\"__page__\": 0, \"cond_threads\": \"{\\\"cond_threads_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\", \"ref_file\": \"\\\"ebola\\\"\", \"__rerun_remap_job_id__\": null, \"cond_BQSR\": \"{\\\"cond_BQSR_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\", \"analysis_type\": \"{\\\"analysis_type_selector\\\": \\\"HaplotypeCaller\\\", \\\"cond_usage\\\": {\\\"emitRefConfidence\\\": \\\"GVCF\\\", \\\"__current_case__\\\": 0, \\\"cond_usage_selector\\\": \\\"GVCF\\\"}, \\\"optional_parameters\\\": {\\\"optional_parameters_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}, \\\"cond_bam_input\\\": {\\\"input\\\": null, \\\"all_in_one\\\": \\\"False\\\", \\\"__current_case__\\\": 1}, \\\"__current_case__\\\": 5}\", \"cond_intervals\": \"{\\\"cond_intervals_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\"}", - "tool_version": "3.4-0.d9", - "type": "tool", - "user_outputs": [], - "uuid": "7c602119-d676-4bf2-ac8d-0369e875e5fe" - } - }, - "uuid": "3855b137-2a43-4c8f-80f5-79d820d618ed" -} \ No newline at end of file diff -r e1ad03534fb2 -r 24d2dc34dd17 Galaxy-Workflow-SAMTools.ga --- a/Galaxy-Workflow-SAMTools.ga Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ -{ - "a_galaxy_workflow": "true", - "annotation": "This workflow uses SAMTools to call SNPs and INDELs.", - "format-version": "0.1", - "name": "SAMTools", - "steps": { - "0": { - "annotation": "", - "id": 0, - "input_connections": {}, - "inputs": [ - { - "description": "", - "name": ".bam file" - } - ], - "label": null, - "name": "Input dataset", - "outputs": [], - "position": { - "left": 200, - "top": 156 - }, - "tool_errors": null, - "tool_id": null, - "tool_state": "{\"name\": \".bam file\"}", - "tool_version": null, - "type": "data_input", - "user_outputs": [], - "uuid": "955f71da-3fad-46c4-a1d9-b0b1cb484569" - }, - "1": { - "annotation": "", - "id": 1, - "input_connections": {}, - "inputs": [ - { - "description": "", - "name": "Reference genome." - } - ], - "label": null, - "name": "Input dataset", - "outputs": [], - "position": { - "left": 512, - "top": 285 - }, - "tool_errors": null, - "tool_id": null, - "tool_state": "{\"name\": \"Reference genome.\"}", - "tool_version": null, - "type": "data_input", - "user_outputs": [], - "uuid": "7069b8d2-a89a-472a-8e18-b5b8f076de33" - }, - "2": { - "annotation": "", - "id": 2, - "input_connections": { - "inputFile": { - "id": 0, - "output_name": "output" - } - }, - "inputs": [], - "label": null, - "name": "SortSam", - "outputs": [ - { - "name": "outFile", - "type": "bam" - } - ], - "position": { - "left": 436.5, - "top": 121 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/avowinkel/picard/picard_SortSam/1.135", - "tool_state": "{\"inputFile\": \"null\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"sort_order\": \"\\\"coordinate\\\"\", \"validation_stringency\": \"\\\"LENIENT\\\"\"}", - "tool_version": "1.135", - "type": "tool", - "user_outputs": [], - "uuid": "b06955b6-6bba-4aaa-a919-fbb2aba9fdf5" - }, - "3": { - "annotation": "", - "id": 3, - "input_connections": { - "reference_source|input_bams_0|input_bam": { - "id": 2, - "output_name": "outFile" - }, - "reference_source|ref_file": { - "id": 1, - "output_name": "output" - } - }, - "inputs": [], - "label": null, - "name": "MPileup", - "outputs": [ - { - "name": "output_mpileup", - "type": "pileup" - }, - { - "name": "output_log", - "type": "txt" - } - ], - "position": { - "left": 746, - "top": 192 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_mpileup/samtools_mpileup/0.0.3", - "tool_state": "{\"__page__\": 0, \"advanced_options\": \"{\\\"advanced_options_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 1}\", \"__rerun_remap_job_id__\": null, \"genotype_likelihood_computation_type\": \"{\\\"genotype_likelihood_computation_type_selector\\\": \\\"do_not_perform_genotype_likelihood_computation\\\", \\\"__current_case__\\\": 1}\", \"reference_source\": \"{\\\"ref_file\\\": null, \\\"reference_source_selector\\\": \\\"history\\\", \\\"input_bams\\\": [{\\\"__index__\\\": 0, \\\"input_bam\\\": null}], \\\"__current_case__\\\": 1}\"}", - "tool_version": "0.0.3", - "type": "tool", - "user_outputs": [], - "uuid": "99b58983-e355-4357-b77e-8e29afcdc866" - } - }, - "uuid": "6b217aca-887a-4cca-91e4-446a7cc50726" -} \ No newline at end of file diff -r e1ad03534fb2 -r 24d2dc34dd17 Galaxy-Workflow-SVDetect.ga --- a/Galaxy-Workflow-SVDetect.ga Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,163 +0,0 @@ -{ - "a_galaxy_workflow": "true", - "annotation": "This workflows uses SVDetect to call structural variants.", - "format-version": "0.1", - "name": "SVDetect", - "steps": { - "0": { - "annotation": "", - "id": 0, - "input_connections": {}, - "inputs": [], - "label": null, - "name": "Import data", - "outputs": [ - { - "name": "outbamfile", - "type": "bam" - }, - { - "name": "outlenfile", - "type": "len" - }, - { - "name": "outsvfile", - "type": "sv" - } - ], - "position": { - "left": 164, - "top": 190 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_import/1.0.0", - "tool_state": "{\"file_name\": \"\\\"file1\\\"\", \"__rerun_remap_job_id__\": null, \"type\": \"{\\\"file_type\\\": \\\"bam\\\", \\\"__current_case__\\\": 0}\", \"file_path\": \"\\\"\\\"\", \"__page__\": 0}", - "tool_version": "1.0.0", - "type": "tool", - "user_outputs": [], - "uuid": "02a471b2-8d04-4407-9d24-a377f1ab66ef" - }, - "1": { - "annotation": "", - "id": 1, - "input_connections": {}, - "inputs": [], - "label": null, - "name": "Import data", - "outputs": [ - { - "name": "outbamfile", - "type": "bam" - }, - { - "name": "outlenfile", - "type": "len" - }, - { - "name": "outsvfile", - "type": "sv" - } - ], - "position": { - "left": 404, - "top": 354 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_import/1.0.0", - "tool_state": "{\"file_name\": \"\\\"file1\\\"\", \"__rerun_remap_job_id__\": null, \"type\": \"{\\\"file_type\\\": \\\"bam\\\", \\\"__current_case__\\\": 0}\", \"file_path\": \"\\\"\\\"\", \"__page__\": 0}", - "tool_version": "1.0.0", - "type": "tool", - "user_outputs": [], - "uuid": "d62c6f73-6a55-4ef7-8c2f-fe6371f45224" - }, - "2": { - "annotation": "", - "id": 2, - "input_connections": { - "inputBam": { - "id": 0, - "output_name": "outbamfile" - } - }, - "inputs": [], - "label": null, - "name": "BAM preprocessing", - "outputs": [ - { - "name": "abBAM", - "type": "bam" - }, - { - "name": "log", - "type": "txt" - }, - { - "name": "normBAM", - "type": "bam" - } - ], - "position": { - "left": 404, - "top": 190 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_preprocessing/1.0.0", - "tool_state": "{\"__page__\": 0, \"isizeMin\": \"\\\"0\\\"\", \"newBam\": \"{\\\"__current_case__\\\": 1, \\\"pairNormal\\\": \\\"no\\\"}\", \"isizeMax\": \"\\\"10000\\\"\", \"inputBam\": \"null\", \"__rerun_remap_job_id__\": null, \"nbrePair\": \"\\\"1000000\\\"\", \"readType\": \"\\\"1\\\"\", \"sample_name\": \"\\\"sample\\\"\", \"pairType\": \"\\\"1\\\"\", \"foldPair\": \"\\\"3.0\\\"\"}", - "tool_version": "1.0.0", - "type": "tool", - "user_outputs": [], - "uuid": "8fd607f0-b9ea-486c-9a34-b8bf53e6c611" - }, - "3": { - "annotation": "", - "id": 3, - "input_connections": { - "cmap_file": { - "id": 1, - "output_name": "outlenfile" - }, - "mates_file": { - "id": 2, - "output_name": "abBAM" - } - }, - "inputs": [], - "label": null, - "name": "Detect clusters of anomalously mapped pairs", - "outputs": [ - { - "name": "sv_file", - "type": "sv" - }, - { - "name": "circos_file", - "type": "segdup" - }, - { - "name": "bed_file", - "type": "bed" - }, - { - "name": "log_file", - "type": "txt" - } - ], - "position": { - "left": 639.5, - "top": 201 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_run_parallel/1.0.0", - "tool_state": "{\"__page__\": 0, \"sv_type\": \"\\\"all\\\"\", \"getLinks\": \"{\\\"window_size\\\": \\\"3000\\\", \\\"step_length\\\": \\\"250\\\", \\\"splitmate\\\": \\\"True\\\", \\\"__current_case__\\\": 1, \\\"linking\\\": \\\"linking\\\"}\", \"mates_file\": \"null\", \"__rerun_remap_job_id__\": null, \"getFilteredLinks\": \"{\\\"nb_pairs_threshold\\\": \\\"5\\\", \\\"filtering\\\": \\\"filtering\\\", \\\"chromosomes\\\": \\\"\\\", \\\"file_conversion\\\": {\\\"file_conversion_select\\\": \\\"do_not_convert\\\", \\\"__current_case__\\\": 0}, \\\"splitlink\\\": \\\"False\\\", \\\"filter1\\\": {\\\"strand_filtering\\\": \\\"strand\\\", \\\"final_score_threshold\\\": \\\"1.0\\\", \\\"__current_case__\\\": 1, \\\"filter2\\\": {\\\"nb_pairs_order_threshold\\\": \\\"2\\\", \\\"order_filtering\\\": \\\"order\\\", \\\"__current_case__\\\": 1, \\\"filter3\\\": {\\\"indel_sigma_threshold\\\": \\\"3.0\\\", \\\"dup_sigma_threshold\\\": \\\"3.0\\\", \\\"singleton_sigma_threshold\\\": \\\"4.0\\\", \\\"__current_case__\\\": 1, \\\"insert_size_filtering\\\": \\\"insert\\\"}, \\\"mu_length\\\": \\\"3000\\\", \\\"sigma_length\\\": \\\"250\\\"}}, \\\"__current_case__\\\": 1, \\\"links2SV\\\": \\\"True\\\"}\", \"read2_length\": \"\\\"50\\\"\", \"read1_length\": \"\\\"50\\\"\", \"sample_name\": \"\\\"sample\\\"\", \"mates_orientation\": \"\\\"FR\\\"\", \"cmap_file\": \"null\"}", - "tool_version": "1.0.0", - "type": "tool", - "user_outputs": [], - "uuid": "d4109ce9-9ae3-4095-afd3-d16a4b900804" - } - }, - "uuid": "ddc77ce0-31e9-4e37-b1f8-7188bf46a9a4" -} \ No newline at end of file diff -r e1ad03534fb2 -r 24d2dc34dd17 Galaxy-Workflow-VARSCAN.ga --- a/Galaxy-Workflow-VARSCAN.ga Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -{ - "a_galaxy_workflow": "true", - "annotation": "This workflow call SNPs, INDELs and CNSs.", - "format-version": "0.1", - "name": "VARSCAN", - "steps": { - "0": { - "annotation": "", - "id": 0, - "input_connections": {}, - "inputs": [ - { - "description": "", - "name": ".bam file." - } - ], - "label": null, - "name": "Input dataset", - "outputs": [], - "position": { - "left": 200, - "top": 135 - }, - "tool_errors": null, - "tool_id": null, - "tool_state": "{\"name\": \".bam file.\"}", - "tool_version": null, - "type": "data_input", - "user_outputs": [], - "uuid": "f0436072-1b25-459c-ad0b-a9e5379bcb93" - }, - "1": { - "annotation": "", - "id": 1, - "input_connections": {}, - "inputs": [ - { - "description": "", - "name": "Reference genome." - } - ], - "label": null, - "name": "Input dataset", - "outputs": [], - "position": { - "left": 202.5, - "top": 211 - }, - "tool_errors": null, - "tool_id": null, - "tool_state": "{\"name\": \"Reference genome.\"}", - "tool_version": null, - "type": "data_input", - "user_outputs": [], - "uuid": "197c41ff-2653-4195-b961-d6e0444a8c73" - }, - "2": { - "annotation": "", - "id": 2, - "input_connections": { - "reference_source|input_bams_0|input_bam": { - "id": 0, - "output_name": "output" - }, - "reference_source|ref_file": { - "id": 1, - "output_name": "output" - } - }, - "inputs": [], - "label": null, - "name": "MPileup", - "outputs": [ - { - "name": "output_mpileup", - "type": "pileup" - }, - { - "name": "output_log", - "type": "txt" - } - ], - "position": { - "left": 444.5, - "top": 141 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_mpileup/samtools_mpileup/0.0.3", - "tool_state": "{\"__page__\": 0, \"advanced_options\": \"{\\\"advanced_options_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 1}\", \"__rerun_remap_job_id__\": null, \"genotype_likelihood_computation_type\": \"{\\\"genotype_likelihood_computation_type_selector\\\": \\\"do_not_perform_genotype_likelihood_computation\\\", \\\"__current_case__\\\": 1}\", \"reference_source\": \"{\\\"ref_file\\\": null, \\\"reference_source_selector\\\": \\\"history\\\", \\\"input_bams\\\": [{\\\"__index__\\\": 0, \\\"input_bam\\\": null}], \\\"__current_case__\\\": 1}\"}", - "tool_version": "0.0.3", - "type": "tool", - "user_outputs": [], - "uuid": "da60e628-cc66-417a-8419-d0ada8df76e0" - }, - "3": { - "annotation": "", - "id": 3, - "input_connections": { - "in_file": { - "id": 2, - "output_name": "output_mpileup" - } - }, - "inputs": [], - "label": null, - "name": "VarScan mpileup", - "outputs": [ - { - "name": "output", - "type": "vcf" - }, - { - "name": "log", - "type": "txt" - } - ], - "position": { - "left": 710.5, - "top": 190 - }, - "post_job_actions": {}, - "tool_errors": null, - "tool_id": "toolshed.g2.bx.psu.edu/repos/fcaramia/varscan/varscan_mpileup/2.3.5", - "tool_state": "{\"strand_filter\": \"\\\"1\\\"\", \"min_avg_qual\": \"\\\"15\\\"\", \"min_coverage\": \"\\\"8\\\"\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"min_freq_for_hom\": \"\\\"0.75\\\"\", \"min_var_freq\": \"\\\"0.01\\\"\", \"exe_command\": \"\\\"mpileup2snp\\\"\", \"min_reads2\": \"\\\"2\\\"\", \"p_value\": \"\\\"0.99\\\"\", \"variants\": \"\\\"1\\\"\", \"vcf_sample_list\": \"null\", \"in_file\": \"null\"}", - "tool_version": "2.3.5", - "type": "tool", - "user_outputs": [], - "uuid": "ad908d2b-e0dc-4742-8c03-e612aab136aa" - } - }, - "uuid": "cd50a688-f133-497a-94b5-974f6c41af9e" -} \ No newline at end of file diff -r e1ad03534fb2 -r 24d2dc34dd17 picard/read_group_macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/picard/read_group_macros.xml Fri Sep 13 02:21:23 2019 -0400 @@ -0,0 +1,294 @@ + + + +#def identifier_or_name($input1) + #if hasattr($input1, 'element_identifier') + #return $input1.element_identifier + #else + #return $input1.name.rstrip('.gz').rstrip('.fastq').rstrip('.fq') + #end if +#end def + +#def clean(name) + #import re + #set $name_clean = re.sub('[^\w\-_\.]', '_', $name) + #return $name_clean +#end def + +#def read_group_name_default($input1, $input2=None) + #if $input2 is None + #return $clean($identifier_or_name($input1)) + #else + #import itertools + #set $input_name1 = $clean($identifier_or_name($input1)) + #set $input_name2 = $clean($identifier_or_name($input2)) + #set $common_prefix = ''.join([c[0] for c in itertools.takewhile(lambda x: all(x[0] == y for y in x), itertools.izip(*[$input_name1, $input_name2]))]) + #if len($common_prefix) > 3 + #return $common_prefix + #else + #return $input_name1 + #end if + #end if +#end def + +#def format_read_group(prefix, value, quote='', arg='') + #if $value + #return $arg + $quote + $prefix + $value + $quote + #else + #return '' + #end if +#end def + +#def rg_param(name) + #if $varExists("rg") + #return $rg.get($name, None) + #else + #return $getVar($name, None) + #end if +#end def + +#set $use_rg = True + + + +#if $use_rg + #if $rg_param('read_group_id_conditional') is None + #set $rg_id = $rg_auto_name + #elif $rg_param('read_group_id_conditional').do_auto_name + #set $rg_id = $rg_auto_name + #else + #set $rg_id = str($rg_param('read_group_id_conditional').ID) + #end if + + #if $rg_param('read_group_sm_conditional') is None + #set $rg_sm = '' + #elif $rg_param('read_group_sm_conditional').do_auto_name + #set $rg_sm = $rg_auto_name + #else + #set $rg_sm = str($rg_param('read_group_sm_conditional').SM) + #end if + + #if $rg_param('PL') + #set $rg_pl = str($rg_param('PL')) + #else + #set $rg_pl = '' + #end if + + #if $rg_param('read_group_lb_conditional') is None + #set $rg_lb = '' + #elif $rg_param('read_group_lb_conditional').do_auto_name + #set $rg_lb = $rg_auto_name + #else + #set $rg_lb = str($rg_param('read_group_lb_conditional').LB) + #end if + + #if $rg_param('CN') + #set $rg_cn = str($rg_param('CN')) + #else + #set $rg_cn = '' + #end if + + #if $rg_param("DS") + #set $rg_ds = str($rg_param("DS")) + #else + #set $rg_ds = '' + #end if + + #if $rg_param("DT") + #set $rg_dt = str($rg_param("DT")) + #else + #set $rg_dt = '' + #end if + + #if $rg_param("FO") + #set $rg_fo = str($rg_param("FO")) + #else + #set $rg_fo = '' + #end if + + #if $rg_param("KS") + #set $rg_ks = str($rg_param("KS")) + #else + #set $rg_ks = '' + #end if + + #if $rg_param("PG") + #set $rg_pg = str($rg_param("PG")) + #else + #set $rg_pg = '' + #end if + + #if str($rg_param("PI")) + #set $rg_pi = str($rg_param("PI")) + #else + #set $rg_pi = '' + #end if + + #if $rg_param("PU") + #set $rg_pu = str($rg_param("PU")) + #else + #set $rg_pu = '' + #end if +#end if + + +#set $use_rg = str($rg.rg_selector) != "do_not_set" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \*|[ACMGRSVTWYHKDBN]+$ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r e1ad03534fb2 -r 24d2dc34dd17 sr_assembly/velvetg.xml --- a/sr_assembly/velvetg.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,301 +0,0 @@ - - Velvet sequence assembler for very short reads - velvetg 2>&1 | grep "Version" | sed -e 's/Version //' - - velvetg_wrapper.py - '$input.extra_files_path' - #if $generate_amos.afg == "yes": - -amos_file $generate_amos.afg - #end if - #if $unused_reads.generate_unused == "yes": - -unused_reads $unused_reads.generate_unused - #end if - $read_trkg - #if $coverage.cutoff == "auto": - -cov_cutoff auto - #elif $coverage.cutoff == "value": - -cov_cutoff $coverage.cov_cutoff - #end if - #if $expected.coverage == "auto": - -exp_cov auto - #elif $expected.coverage == "value": - -exp_cov $expected.exp_cov - #end if - #if $contig_lgth.use_contig_lgth == "yes": - -min_contig_lgth $contig_lgth.min_contig_lgth - #end if - #if $reads.paired == "yes": - #if int($reads.ins_length) > 0: - -ins_length $reads.ins_length - #end if - #if $reads.options.advanced == "yes": - #if int($reads.options.ins_length_sd) > 0: - -ins_length_sd $reads.options.ins_length_sd - #end if - #if int($reads.options.ins_length2) > 0: - -ins_length2 $reads.options.ins_length2 - #end if - #if int($reads.options.ins_length2_sd) > 0: - -ins_length2_sd $reads.options.ins_length2_sd - #end if - #if int($reads.options.ins_length_long) > 0: - -ins_length_long $reads.options.ins_length_long - #end if - #if int($reads.options.ins_length_long_sd) > 0: - -ins_length_long_sd $reads.options.ins_length_long_sd - #end if - #if int($reads.options.max_branch_length) > 0: - -max_branch_length $reads.options.max_branch_length - #end if - #if int($reads.options.max_divergence) > 0: - -max_divergence $reads.options.max_divergence - #end if - #if int($reads.options.max_gap_count) > 0: - -max_gap_count $reads.options.max_gap_count - #end if - #if int($reads.options.min_pair_count) > 0: - -min_pair_count $reads.options.min_pair_count - #end if - #if int($reads.options.max_coverage) > 0: - -max_coverage $reads.options.max_coverage - #end if - #if int($reads.options.long_mult_cutoff) > 0: - -long_mult_cutoff $reads.options.long_mult_cutoff - #end if - $reads.options.scaffolding - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - read_trkg is True - - - last_graph['generate_graph'] == "yes" - - - generate_amos['afg'] == "yes" - - - unused_reads['generate_unused'] == "yes" - - - - - - velvet - - - - - - - - - - - - - - - - - - - - - - - - -**Velvet Overview** - -Velvet_ is a de novo genomic assembler specially designed for short read sequencing technologies, such as Solexa or 454, developed by Daniel Zerbino and Ewan Birney at the European Bioinformatics Institute (EMBL-EBI), near Cambridge, in the United Kingdom. - -Velvet currently takes in short read sequences, removes errors then produces high quality unique contigs. It then uses paired-end read and long read information, when available, to retrieve the repeated areas between contigs. - -Read the Velvet `documentation`__ for details on using the Velvet Assembler. - -.. _Velvet: http://www.ebi.ac.uk/~zerbino/velvet/ - -.. __: http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf - ------- - -**Input formats** - -Velvet can input sequence files in the following formats: fasta fastq fasta.gz fastq.gz eland gerald - -The input files are prepared for the velvet assembler using **velveth**. - ------- - -**Outputs** - -**Contigs** - -The *contigs.fa* file. -This fasta file contains the sequences of the contigs longer than 2k, where k is the word-length used in velveth. If you have specified a min contig length threshold, then the contigs shorter than that value are omitted. -Note that the length and coverage information provided in the header of each contig should therefore be understood in k-mers and in k-mer coverage (cf. 5.1) respectively. -The N's in the sequence correspond to gaps between scaffolded contigs. The number of N's corresponds to the estimated length of the gap. For reasons of compatibility with the archives, any gap shorter than 10bp is represented by a sequence of 10 N's. - -**Stats** - -The *stats.txt* file. -This file is a simple tabbed-delimited description of the nodes. The column names are pretty much self-explanatory. Note however that node lengths are given in k-mers. To obtain the length in nucleotides of each node you simply need to add k - 1, where k is the word-length used in velveth. -The in and out columns correspond to the number of arcs on the 5' and 3' ends of the contig respectively. -The coverages in columns short1 cov, short1 Ocov, short2 cov, and short2 Ocov are provided in k-mer coverage (5.1). -Also, the difference between # cov and # Ocov is the way these values are computed. In the first count, slightly divergent sequences are added to the coverage tally. However, in the second, stricter count, only the sequences which map perfectly onto the consensus sequence are taken into account. - -**LastGraph** - -The *LastGraph* file. -This file describes in its entirety the graph produced by Velvet. - -**AMOS.afg** - -The *velvet_asm.afg* file. -This file is mainly designed to be read by the open-source AMOS genome assembly package. Nonetheless, a number of programs are available to transform this kind of file into other assembly file formats (namely ACE, TIGR, Arachne and Celera). See http://amos.sourceforge.net/ for more information. -The file describes all the contigs contained in the contigs.fa file (cf 4.2.1). - ------- - -**Velvet parameter list** - -This is a list of implemented Velvetg options:: - - Standard options: - -cov_cutoff floating-point|auto : removal of low coverage nodes AFTER tour bus or allow the system to infer it - (default: no removal) - -ins_length integer : expected distance between two paired end reads (default: no read pairing) - -read_trkg yes|no : tracking of short read positions in assembly (default: no tracking) - -min_contig_lgth integer : minimum contig length exported to contigs.fa file (default: hash length * 2) - -amos_file yes|no : export assembly to AMOS file (default: no export) - -exp_cov floating point|auto : expected coverage of unique regions or allow the system to infer it - (default: no long or paired-end read resolution) - - Advanced options: - -ins_length2 integer : expected distance between two paired-end reads in the second short-read dataset (default: no read pairing) - -ins_length_long integer : expected distance between two long paired-end reads (default: no read pairing) - -ins_length*_sd integer : est. standard deviation of respective dataset (default: 10% of corresponding length) - [replace '*' by nothing, '2' or '_long' as necessary] - -scaffolding yes|no : scaffolding of contigs used paired end information (default: on) - -max_branch_length integer : maximum length in base pair of bubble (default: 100) - -max_divergence floating-point : maximum divergence rate between two branches in a bubble (default: 0.2) - -max_gap_count integer : maximum number of gaps allowed in the alignment of the two branches of a bubble (default: 3) - -min_pair_count integer : minimum number of paired end connections to justify the scaffolding of two long contigs (default: 10) - -max_coverage floating point : removal of high coverage nodes AFTER tour bus (default: no removal) - -long_mult_cutoff int : minimum number of long reads required to merge contigs (default: 2) - -unused_reads yes|no : export unused reads in UnusedReads.fa file (default: no) - - Output: - directory/contigs.fa : fasta file of contigs longer than twice hash length - directory/stats.txt : stats file (tab-spaced) useful for determining appropriate coverage cutoff - directory/LastGraph : special formatted file with all the information on the final graph - directory/velvet_asm.afg : (if requested) AMOS compatible assembly file - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 sr_assembly/velvetg_wrapper.py --- a/sr_assembly/velvetg_wrapper.py Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -#!/usr/bin/env python - -""" -Classes encapsulating decypher tool. -James E Johnson - University of Minnesota -""" -import os -import sys -import subprocess - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - #Parse Command Line - working_dir = sys.argv[1] - inputs = ' '.join(sys.argv[2:]) - for _ in ('Roadmaps', 'Sequences'): - os.symlink(os.path.join(working_dir, _), _) - cmdline = 'velvetg . %s' % (inputs) - print "Command to be executed: %s" % cmdline - try: - proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE ) - returncode = proc.wait() - # get stderr, allowing for case where it's very large - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running velvetg ' + str( e ) ) - - -if __name__ == "__main__": - __main__() diff -r e1ad03534fb2 -r 24d2dc34dd17 sr_assembly/velveth.xml --- a/sr_assembly/velveth.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ - - Prepare a dataset for the Velvet velvetg Assembler - velveth 2>&1 | grep "Version" | sed -e 's/Version //' - - velveth_wrapper.py - '$out_file1' '$out_file1.extra_files_path' - $hash_length - $strand_specific - #for $i in $inputs - ${i.file_format} - ${i.read_type} - ${i.input} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - velvet - - - - - - - - - - - - - - - - - - -**Velvet Overview** - -Velvet_ is a de novo genomic assembler specially designed for short read sequencing technologies, such as Solexa or 454, developed by Daniel Zerbino and Ewan Birney at the European Bioinformatics Institute (EMBL-EBI), near Cambridge, in the United Kingdom. - -Velvet currently takes in short read sequences, removes errors then produces high quality unique contigs. It then uses paired-end read and long read information, when available, to retrieve the repeated areas between contigs. - -Read the Velvet `documentation`__ for details on using the Velvet Assembler. - -.. _Velvet: http://www.ebi.ac.uk/~zerbino/velvet/ - -.. __: http://www.ebi.ac.uk/~zerbino/velvet/Manual.pdf - ------- - -**Velveth** - -Velveth takes in a number of sequence files, produces a hashtable, then outputs two files in an output directory (creating it if necessary), Sequences and Roadmaps, which are necessary to velvetg. - ------- - -**Hash Length** - -The hash length, also known as k-mer length, corresponds to the length, in base pairs, of the words being hashed. - -The hash length is the length of the k-mers being entered in the hash table. Firstly, you must observe three technical constraints:: - -# it must be an odd number, to avoid palindromes. If you put in an even number, Velvet will just decrement it and proceed. -# it must be below or equal to MAXKMERHASH length (cf. 2.3.3, by default 31bp), because it is stored on 64 bits -# it must be strictly inferior to read length, otherwise you simply will not observe any overlaps between reads, for obvious reasons. - -Now you still have quite a lot of possibilities. As is often the case, it's a trade- off between specificity and sensitivity. Longer kmers bring you more specificity (i.e. less spurious overlaps) but lowers coverage (cf. below). . . so there's a sweet spot to be found with time and experience. -We like to think in terms of "k-mer coverage", i.e. how many times has a k-mer been seen among the reads. The relation between k-mer coverage Ck and standard (nucleotide-wise) coverage C is Ck = C # (L - k + 1)/L where k is your hash length, and L you read length. -Experience shows that this kmer coverage should be above 10 to start getting decent results. If Ck is above 20, you might be "wasting" coverage. Experience also shows that empirical tests with different values for k are not that costly to run! - -**Input Files** - -Velvet works mainly with fasta and fastq formats. For paired-end reads, the assumption is that each read is next to its mate -read. In other words, if the reads are indexed from 0, then reads 0 and 1 are paired, 2 and 3, 4 and 5, etc. - -Supported file formats are:: - - fasta - fastq - fasta.gz - fastq.gz - eland - gerald - -Read categories are:: - - short (default) - shortPaired - short2 (same as short, but for a separate insert-size library) - shortPaired2 (see above) - long (for Sanger, 454 or even reference sequences) - longPaired - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 sr_assembly/velveth_wrapper.py --- a/sr_assembly/velveth_wrapper.py Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -""" -Classes encapsulating decypher tool. -James E Johnson - University of Minnesota -""" -import pkg_resources -import logging, os, string, sys, tempfile, glob, shutil, types, urllib -import shlex, subprocess -from optparse import OptionParser, OptionGroup -from stat import * - - -log = logging.getLogger( __name__ ) - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - s = 'velveth_wrapper.py: argv = %s\n' % (sys.argv) - argcnt = len(sys.argv) - html_file = sys.argv[1] - working_dir = sys.argv[2] - try: # for test - needs this done - os.makedirs(working_dir) - except Exception, e: - stop_err( 'Error running velveth ' + str( e ) ) - hash_length = sys.argv[3] - inputs = string.join(sys.argv[4:],' ') - cmdline = 'velveth %s %s %s > /dev/null' % (working_dir, hash_length, inputs) - try: - proc = subprocess.Popen( args=cmdline, shell=True, stderr=subprocess.PIPE ) - returncode = proc.wait() - # get stderr, allowing for case where it's very large - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += proc.stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running velveth ' + str( e ) ) - sequences_path = os.path.join(working_dir,'Sequences') - roadmaps_path = os.path.join(working_dir,'Roadmaps') - rval = ['Velvet Galaxy Composite Dataset

'] - rval.append('

' % (cmdline) ) - rval.append('

This composite dataset is composed of the following files:

%s %s
%s %s

' ) - f = file(html_file,'w') - f.write("\n".join( rval )) - f.write('\n') - f.close() - -if __name__ == "__main__": __main__() diff -r e1ad03534fb2 -r 24d2dc34dd17 suite_samtools_0_1_19/repository_dependencies.xml --- a/suite_samtools_0_1_19/repository_dependencies.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/BAM_preprocessingPairs.pl --- a/svdetect/BAM_preprocessingPairs.pl Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,340 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; -use Getopt::Std; -my $version = '0.5_galaxy'; - -my $SAMTOOLS_BIN_DIR="/home/cbl/Downloads/samtools-1.9/"; - -my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); - -getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY - -my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; - -my $pt_bad_mates_file=$opts{b}; #GALAXY -my $pt_log_file=$opts{l}; #GALAXY -my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY - - -die(qq/ - -Description: - - Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input - for SVDetect. - - From all pairs mapped onto the reference genome, this script outputs abnormal pairs: - - mapped on two different chromosomes - - with an incorrect strand orientation and\/or pair order - - with an insert size distance +- sigma threshold - into a file - - -BAM\/SAM File input format only, sorted by read names - - Version : $version - SAMtools required for BAM files - - -Usage: BAM_preprocessingPairs.pl [options] - -Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] - -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] - -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] - -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] - -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] - -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] - -d dump normal pairs into a file [] (optional) - -o STRING output directory [$working_dir] - -\n/) if (@ARGV == 0 && -t STDIN); - -unless (-d $opts{o}){ - mkdir $opts{o} or die; -} -$opts{o}.="/" if($opts{o}!~/\/$/); - -my $mates_file=shift(@ARGV); - -$mates_file=readlink($mates_file); - -my $bad_mates_file=(split(/\//,$mates_file))[$#_]; - -if($bad_mates_file=~/.(s|b)am$/){ - $bad_mates_file=~s/.(b|s)am$/.ab.sam/; - $bad_mates_file=$opts{o}.$bad_mates_file; -} - -else{ - die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; -} - -my $good_mates_file; -if($opts{d}){ - $good_mates_file=(split(/\//,$mates_file))[$#_]; - $good_mates_file=~s/.(b|s)am$/.norm.sam/; - $good_mates_file=$opts{o}.$good_mates_file; -} - -my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY - -#------------------------------------------------------------------------------# -#Calculate mu and sigma - -open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; - -print LOG "\# Calculating mu and sigma lengths...\n"; -print LOG "-- file=$mates_file\n"; -print LOG "-- n=$opts{n}\n"; -print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; - -my ($record, $sumX,$sumX2) = (0,0,0); -my $warn=$opts{n}/10; -my $prev_pair="FIRST"; - -my $bam=($mates_file =~ /.bam$/)? 1:0; - -if($bam){ - open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; -}else{ - open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; -} - -while(){ - - my @t=split; - - next if ($t[0]=~/^@/); - - my $current_pair=$t[0]; - next if($current_pair eq $prev_pair); - $prev_pair=$current_pair; - - my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); - - next if (($t[1]&0x0004) || ($t[1]&0x0008)); - next if ($length<$opts{s} || $length>$opts{S}) ; - - if($chr2 eq "="){ - - $sumX += $length; #add to sum and sum^2 for mean and variance calculation - $sumX2 += $length*$length; - $record++; - } - - if($record>$warn){ - print LOG "-- $warn pairs analysed\n"; - $warn+=$warn; - } - - last if ($record>$opts{n}); - -} -close (MATES); - -$record--; -my $mu = $sumX/$record; -my $sigma = sqrt($sumX2/$record - $mu*$mu); - -print LOG "-- Total : $record pairs analysed\n"; -print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Preprocessing pairs - -$warn=100000; - -$record=0; -my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); - -my $read_type=($opts{t})? "Illumina":"SOLiD"; -my $pair_type=($opts{p})? "paired-end":"mate-paired"; - -print LOG "\# Preprocessing pairs...\n"; -print LOG "-- file= $mates_file\n"; -print LOG "-- type= $read_type $pair_type reads\n"; -print LOG "-- sigma threshold= $opts{f}\n"; -print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; - -my @header; - -if($bam){ - open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; - @header=

; - close HEADER; - open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; -}else{ - open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; -} - -open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; -print AB @header if($bam); - -if($opts{d}){ - open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; - print NORM @header if($bam); -} - -$prev_pair="FIRST"; -my $prev_bad; - -while(){ - - my @t=split; - my $bad=0; - - if ($t[0]=~/^@/){ - print AB; - print NORM if ($opts{d}); - next; - } - - my $current_pair=$t[0]; - if($current_pair eq $prev_pair){ - next if($prev_bad==-1); - if($prev_bad){ - print AB; - }elsif(!$prev_bad){ - print NORM if($opts{d}); - } - next; - } - - $prev_pair=$current_pair; - - my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); - - if (($t[1]&0x0004) || ($t[1]&0x0008)){ - $prev_bad=-1; - $count{unmap}++; - $record++; - next; - - } - - my $strand1 = (($t[1]&0x0010))? 'R':'F'; - my $strand2 = (($t[1]&0x0020))? 'R':'F'; - my $order1 = (($t[1]&0x0040))? '1':'2'; - my $order2 = (($t[1]&0x0080))? '1':'2'; - - if($order1 == 2){ - ($strand1,$strand2)=($strand2,$strand1); - ($chr1,$chr2)=($chr2,$chr1); - ($pos1,$pos2)=($pos2,$pos1); - ($order1,$order2)=($order2,$order1); - } - - my $sense=$strand1.$strand2; - - if($chr1 ne "=" && $chr2 ne "="){ - $bad=1; - $count{chr}++; - } - - if($opts{p}){ #paired-end - if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ - $bad=1; - $count{sense}++; - } - }else{ #mate-pair - if($opts{t}){ #Illumina - if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ - $bad=1; - $count{sense}++; - } - }else{ #SOLiD - if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ - $bad=1; - $count{sense}++; - } - } - } - - if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ - $bad=1; - $count{dist}++; - } - - if($bad){ - print AB; - $count{ab}++; - $prev_bad=$bad; - }else{ - print NORM if ($opts{d}); - $count{norm}++; - $prev_bad=$bad; - } - - $record++; - - if($record>$warn){ - print LOG "-- $warn pairs analysed\n"; - $warn+=100000; - } -} - -close AB; -close NORM if($opts{d}); - -print LOG "-- Total : $record pairs analysed\n"; -print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; -print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; -print LOG "---- $count{ab} abnormal mapped pairs\n"; -print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; -print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; -print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; -print LOG "--- $count{norm} correct mapped pairs\n"; - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#OUTPUT - -if($bam){ - - my $bam_file=$bad_mates_file; - $bam_file=~s/.sam$/.bam/; - print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; - system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); - unlink($bad_mates_file); - print LOG "-- output created: $bam_file\n"; - - system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY - - if($opts{d}){ - $bam_file=$good_mates_file; - $bam_file=~s/.sam$/.bam/; - print LOG "\# Converting sam to bam for correct mapped pairs\n"; - system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); - unlink($good_mates_file); - print LOG "-- output created: $bam_file\n"; - - system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY - - } - -} - -else{ - print LOG "-- output created: $bad_mates_file\n"; - print LOG "-- output created: $good_mates_file\n" if($opts{d}); -} - -close LOG; - -system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub decimal{ - - my $num=shift; - my $digs_to_cut=shift; - - $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); - - return $num; -} -#------------------------------------------------------------------------------# diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/BAM_preprocessingPairs.xml --- a/svdetect/BAM_preprocessingPairs.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - - to get abnormal pairs - - BAM_preprocessingPairs.pl -t '$readType' -p '$pairType' -n '$nbrePair' -s '$isizeMin' -S '$isizeMax' -f '$foldPair' -o $__new_file_path__/svdetect -b '$abBAM' -l '$log' -N $sample_name - #if $newBam.pairNormal=="yes" - -d -x '$normBAM' - #end if - '$inputBam' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - newBam['pairNormal'] == 'yes' - - - - - -**What it does** - -Bam_preprocessingPairs - Version 0.5 - -Preprocessing of mates to get anomalously mapped mate-pair/paired-end reads as input for SVDetect. - -From all pairs mapped onto the reference genome, this script outputs abnormal pairs: - - * mapped on two different chromosomes - * with an incorrect strand orientation and/or pair order - * with an insert size distance +- sigma threshold - -into a file prefix.ab.bam/sam sorted by read names - --BAM/SAM File input format only. - -SAMtools required for BAM files - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/SVDetect_compare.pl --- a/svdetect/SVDetect_compare.pl Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,716 +0,0 @@ -#!/usr/bin/perl -w - -=pod - -=head1 NAME - -SVDetect Compare for Galaxy - -Version: 0.8b for Galaxy - -=head1 SYNOPSIS - -SVDetect_compare.pl links2compare -conf [-help] [-man] - -=cut - -# ------------------------------------------------------------------- - -use strict; -use warnings; - -use Pod::Usage; -use Getopt::Long; - -use Config::General; -use Tie::IxHash; - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#PARSE THE COMMAND LINE -my %OPT; -GetOptions(\%OPT, - 'conf=s', - 'out1=s', #GALAXY - 'out2=s', #GALAXY - 'out3=s', #GALAXY - 'out4=s', #GALAXY - 'out5=s', #GALAXY - 'out6=s', #GALAXY - 'out7=s', #GALAXY - 'out8=s', #GALAXY - 'out9=s', #GALAXY - 'l=s', #GALAXY - 'N=s', #GALAXY - 'help', - 'man' - ); - -pod2usage() if $OPT{help}; -pod2usage(-verbose=>2) if $OPT{man}; -pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); - - -pod2usage() if(@ARGV<1); - -tie (my %func, 'Tie::IxHash',links2compare=>\&links2compare); - -foreach my $command (@ARGV){ - pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); -} -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#READ THE CONFIGURATION FILE -my $conf=Config::General->new( -ConfigFile => $OPT{conf}, - -Tie => "Tie::IxHash", - -AllowMultiOptions => 1, - -LowerCaseNames => 1, - -AutoTrue => 1); -my %CONF= $conf->getall; -validateconfiguration(\%CONF); #validation of the configuration parameters - - -my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY -my $BEDTOOLS_BIN_DIR="/bioinfo/local/BEDTools/bin"; #GALAXY - -my $pt_log_file=$OPT{l}; #GALAXY -my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_compare.log"; #GALAXY -open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY - -my @pt_sv_file=($OPT{out1},$OPT{out2},$OPT{out3}) if($OPT{out1}); #GALAXY common,sample,reference -my @pt_circos_file=($OPT{out4},$OPT{out5},$OPT{out6}) if($OPT{out4}); #GALAXY common,sample,reference -my @pt_bed_file=($OPT{out7},$OPT{out8},$OPT{out9}) if($OPT{out7}); #GALAXY common,sample,reference - -$CONF{compare}{sample_link_file}=readlink($CONF{compare}{sample_link_file});#GALAXY -$CONF{compare}{sample_link_file}=~s/.sv.txt//; #GALAXY - -$CONF{compare}{reference_link_file}=readlink($CONF{compare}{reference_link_file});#GALAXY -$CONF{compare}{reference_link_file}=~s/.sv.txt//; #GALAXY - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#COMMAND EXECUTION -foreach my $command (@ARGV){ - &{$func{$command}}(); -} -print LOG "-- end\n"; - -close LOG;#GALAXY -system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY - -exit(0); -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#FUNCTIONS - -# -----------------------------------------------------------------------------# -#MAIN FUNCTION number 5:Comparison between samples, common or specific links -sub links2compare{ - - my @compare_files; - - compareSamples($CONF{general}{output_dir}, - $CONF{compare}{list_samples}, - $CONF{compare}{sample_link_file}, - $CONF{compare}{reference_link_file}, - $CONF{compare}{min_overlap}, - $CONF{compare}{same_sv_type}, - \@compare_files); - - my $pt_ind=0; - - for my $input_file (@compare_files){ - - $input_file=$CONF{general}{output_dir}.$input_file; - - my $output_file=$input_file; - $output_file=~s/unique$/compared/; - - sortLinks($input_file, $output_file,1); - - if($CONF{compare}{circos_output}){ - links2segdup($CONF{circos}{organism_id}, - $CONF{circos}{colorcode}, - $output_file, - $output_file.".segdup.txt"); - system "rm $pt_circos_file[$pt_ind]; ln -s $output_file.segdup.txt $pt_circos_file[$pt_ind]" if (defined $pt_circos_file[$pt_ind]); #GALAXY - } - - if($CONF{compare}{bed_output}){ - links2bedfile($CONF{compare}{read_lengths}, - $CONF{bed}{colorcode}, - $output_file, - $output_file.".bed"); - system "rm $pt_bed_file[$pt_ind]; ln -s $output_file.bed $pt_bed_file[$pt_ind]" if (defined $pt_bed_file[$pt_ind]); #GALAXY - } - - if($CONF{compare}{sv_output}){ - - links2SVfile ($output_file, $output_file.".sv.txt"); - system "rm $pt_sv_file[$pt_ind]; ln -s $output_file.sv.txt $pt_sv_file[$pt_ind]" if (defined $pt_sv_file[$pt_ind]); #GALAXY - } - $pt_ind++; - - } - unlink(@compare_files); - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub compareSamples{ - - my ($dir,$list_samples,$sample_file,$reference_file,$min_overlap,$same_sv_type,$file_names)=@_; - - my @bedpefiles; - my @list=split(",",$list_samples); - my @list_files=($sample_file,$reference_file); - - print LOG "\# Comparison procedure...\n"; - print LOG "-- samples=$list_samples\n". - "-- minimum overlap=$min_overlap\n". - "-- same SV type=$same_sv_type\n"; - - #conversion of links to bedPE format file - print LOG "-- Conversion of links.filtered files to bedPE format\n"; - for my $s (0..$#list) { - - links2bedPElinksfile($list[$s],$list_files[$s],$list_files[$s].".bedpe.txt"); - push(@bedpefiles,$list_files[$s].".bedpe.txt"); - - } - - #get common links between all samples compared - print LOG "-- Getting common links between all samples with BEDTools\n"; - my $common_name=join(".",@list); - - my $nb=scalar @list; - my $command=""; - my $prompt=">"; - - while ($nb>0){ - - for my $i (0..$#list_files){ - - $command.="$BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a ".$list_files[$i].".bedpe.txt"; - my $pipe=0; - - for my $j ($i+1..$#list_files){ - - $command.="| $BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a stdin" if($pipe); - $command.=" -b ".$list_files[$j].".bedpe.txt"; - $pipe=1; - - } - - $command.=$prompt.$dir.$common_name.".bedpe.tmp;"; - $prompt=">>"; - - my $first=shift(@list_files); push(@list_files,$first); - last; - } - $nb--; - } - - system ($command); - - push(@bedpefiles,$dir.$common_name.".bedpe.tmp"); - - #Post comparison to get common links if same type only (as an option) - open( FILE, "<".$dir.$common_name.".bedpe.tmp") or die "Can't open".$dir.$common_name.".bedpe.tmp : $!"; - open( OUT, ">".$dir.$common_name.".bedpe.unique") or die "Can't write in ".$dir.$common_name.".bedpe.unique : $!"; - - while(){ - my @t=split("\t",$_); - my $s=(split("_",$t[6]))[0]; - my ($sv1,$sv2)=($t[7],$t[18]); - splice(@t,11,$#t); - - if($same_sv_type){ - print OUT join("\t",@t)."\n" if($sv1 eq $sv2); - }else{ - print OUT join("\t",@t)."\n"; - } - } - close FILE; - close OUT; - - bedPElinks2linksfile($dir.$common_name.".bedpe.unique", $dir.$common_name.".unique"); - push(@bedpefiles,$dir.$common_name.".bedpe.unique"); - push(@$file_names,$common_name.".unique"); - print LOG "-- output created: ".$dir.$common_name.".compared\n"; - - #get specific links for each sample - print LOG "-- Getting specific links for each sample\n"; - for my $s (0..$#list) { - system("grep -Fxv -f ".$dir.$common_name.".bedpe.unique ".$list_files[$s].".bedpe.txt >".$dir.$list[$s].".bedpe.unique"); - bedPElinks2linksfile($dir.$list[$s].".bedpe.unique",$dir.$list[$s].".unique"); - push(@bedpefiles,$dir.$list[$s].".bedpe.unique"); - push(@$file_names,$list[$s].".unique"); - print LOG "-- output created: ".$dir.$list[$s].".compared\n"; - } - - unlink(@bedpefiles); - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the circos format -sub links2segdup{ - - my($id,$color_code,$links_file,$segdup_file)=@_; - - print LOG "\# Converting to the circos format...\n"; - - tie (my %hcolor,'Tie::IxHash'); #color-code hash table - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - } - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; - - my $index=1; - while(){ - - my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; - - my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links - - print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output - "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; - $index++; - } - - close LINKS; - close SEGDUP; - print LOG "-- output created: $segdup_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bedPE format for BEDTools usage -sub links2bedPElinksfile{ - - my ($sample,$links_file,$bedpe_file)=@_; - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; - - my $nb_links=1; - - while(){ - - chomp; - my @t=split("\t",$_); - my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); - my $type=($chr1 eq $chr2)? "INTRA":"INTER"; - $type.="_".$t[10]; - - $start1--; $start2--; - - print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". - "\t$sample"."_link$nb_links\t$type\t.\t.". - "\t".join("|",@t)."\n"; - - $nb_links++; - } - - close LINKS; - close BEDPE; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub bedPElinks2linksfile{ - - my ($bedpe_file,$links_file)=@_; - - open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; - open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; - - while(){ - - chomp; - my $sample=(split("_",(split("\t",$_))[6]))[0]; - my @t1=(split("\t",$_))[0,1,2,3,4,5]; - my @t2=split(/\|/,(split("\t",$_))[10]); - push(@t2,$sample); - - print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; - - } - close BEDPE; - close LINKS; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bed format -sub links2bedfile{ - - my ($tag_length,$color_code,$links_file,$bed_file)=@_; - - print LOG "\# Converting to the bed format...\n"; - - my $compare=1; - if($links_file!~/compared$/){ - $compare=0; - $tag_length->{none}->{1}=$tag_length->{1}; - $tag_length->{none}->{2}=$tag_length->{2}; - } - - #color-code hash table - tie (my %hcolor,'Tie::IxHash'); - my %color_order; - $color_order{"255,255,255"}=0; - my $n=1; - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - $color_order{$col}=$n; - $n++; - } - - my %pair; - my %pt; - $n=1; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - - my %str=( "F"=>"+", "R"=>"-" ); - - while(){ - - my @t=split; - my $sample=($compare)? pop(@t):"none"; - - my $chr1=$t[0]; - my $chr2=$t[3]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $same_chr=($chr1 eq $chr2)? 1:0; - - my $count=$t[6]; - my $color=getColor($count,\%hcolor,"bed"); - - my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); - my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); - my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); - my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); - my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); - my @position1=deleteBadOrderSensePairs(split(",",$t[14])); - my @position2=deleteBadOrderSensePairs(split(",",$t[15])); - my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); - my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); - - - for my $p (0..$#pairs){ - - if (!exists $pair{$pairs[$p]}){ - - if($same_chr){ - - $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, - $start1[$p]-1, $end2[$p], $color, - 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; - $pt{$n}=$pair{$pairs[$p]}->{0}; - $n++; - - }else{ - - $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, - $start1[$p]-1, $end1[$p], $color, - 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{1}; - $n++; - - - $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, - $start2[$p]-1, $end2[$p], $color, - 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{2}; - $n++; - } - }else{ - - if($same_chr){ - ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); - }else{ - ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); - ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); - } - } - } - } - close LINKS; - - my $nb_pairs=$n-1; - - open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; - print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". - "visibility=2 itemRgb=\"On\"\n"; - - for my $i (1..$nb_pairs){ - print BED join("\t",@{$pt{$i}})."\n"; - } - - close BED; - - print LOG "-- output created: $bed_file\n"; - - undef %pair; - undef %pt; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub links2SVfile{ - - my($links_file,$sv_file)=@_; - - print LOG "\# Converting to the sv output table...\n"; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist - chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering - final_score breakpoint1_start1-end1 breakpoint2_start2-end2); - - my $nb_links=0; - - while (){ - - my @t=split; - my @sv=(); - my $sv_type="-"; - my $strand_ratio="-"; - my $eq_ratio="-"; - my $eq_type="-"; - my $insert_ratio="-"; - my $link="-"; - my ($bk1, $bk2)=("-","-"); - my $score="-"; - - my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); - my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); - my $nb_pairs=$t[6]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; - - #if strand filtering - if (defined $t[16]){ - #if inter-chr link - $sv_type=$t[16]; - if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ - $strand_ratio=floor($1/$2*100)."%"; - $score=$t[18]; - } - if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ - #if intra-chr link with insert size filtering - $strand_ratio=floor($1/$2*100)."%"; - $link=floor($t[17]); - if($sv_type!~/^INV/){ - $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - $score=$t[20]; - }else{ - $score=$t[19]; - } - } - } - - if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ - - #if strand and order filtering only and/or interchr link - $eq_type=$t[18]; - $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - ($bk1, $bk2)=($t[20],$t[21]); - foreach my $bk ($bk1, $bk2){$bk=~s/\),$/ /g; $bk=~s/(\(|$)//g; $bk=~s/,/-/g;} - $score=$t[22]; - - }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ - - #if all three filtering - $link=floor($t[17]); - $eq_type=$t[19]; - $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); - - if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ - $insert_ratio=floor($1/$2*100)."%"; - ($bk1, $bk2)=($t[22],$t[23]); - $score=$t[24]; - - }else{ - ($bk1, $bk2)=($t[21],$t[22]); - $score=$t[23]; - } - foreach my $bk ($bk1, $bk2){$bk=~s/\),$/ /g; $bk=~s/(\(|$)//g; $bk=~s/,/-/g;} - - } - - - push(@sv, $chr_type, $sv_type,$eq_type); - push(@sv,"$chr1\t$start1-$end1"); - push(@sv, $link); - push(@sv,"$chr2\t$start2-$end2", - $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); - - - print SV join("\t",@sv)."\n"; - } - - close LINKS; - close SV; - - system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; - - open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; - my @links=; - close SV; - - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - print SV join("\t",@header)."\n"; - print SV @links; - close SV; - - unlink($sv_file.".sorted"); - - print LOG "-- output created: $sv_file\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub deleteBadOrderSensePairs{ - - my (@tab)=@_; - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[]//g; - push(@tab2,$v) if($v!~/[\$\*\@]$/); - } - return @tab2; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#gets starts and ends Coords when start=leftmost given positions, directions and orders -sub getCoordswithLeftMost { - - my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; - - for my $i (0..scalar(@{$positions})-1) { - - if($strand->[$i] eq 'F'){ - $starts->[$i]=$positions->[$i]; - $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; - }else{ - $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; - $ends->[$i]=$positions->[$i]; - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub floor { - my $nb = $_[0]; - $nb=~ s/\..*//; - return $nb; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub decimal{ - - my $num=shift; - my $digs_to_cut=shift; - - $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); - - return $num; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Sort links according the concerned chromosomes and their coordinates -sub sortLinks{ - - my ($links_file,$sortedlinks_file,$unique)=@_; - - print LOG "# Sorting links...\n"; - - my $pipe=($unique)? "| sort -u":""; - system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getColor{ - - my($count,$hcolor,$format)=@_; - for my $col ( keys % { $hcolor} ) { - return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); - } - return "white" if($format eq "circos"); - return "255,255,255" if($format eq "bed"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#check if the configuration file is correct -sub validateconfiguration{ - - my %conf=%{$_[0]}; - my $list_prgs="@ARGV"; - - my @circos_params=qw(organism_id colorcode); - my @bed_params=qw(colorcode); - my @compare_params=qw(list_samples list_read_lengths sample_link_file reference_link_file); - - unless (defined($conf{general}{output_dir})) { - $conf{general}{output_dir} = "."; - } - unless (-d $conf{general}{output_dir}){ - mkdir $conf{general}{output_dir} or die; - } - $conf{general}{output_dir}.="/" if($conf{general}{output_dir}!~/\/$/); - - - if($list_prgs=~/links2compare/){ - foreach my $p (@compare_params) { - die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); - } - - unless (defined($conf{compare}{same_sv_type})) { - $conf{compare}{same_sv_type} = 0; - } - - unless (defined($conf{compare}{min_overlap})) { - $conf{compare}{min_overlap} = 1E-9; - } - - if($conf{compare}{circos_output}){ - foreach my $p (@circos_params) { - next if($list_prgs=~/^ratio/ && $p eq "colorcode"); - die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); - } - } - if($conf{compare}{bed_output}){ - foreach my $p (@bed_params) { - die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); - } - die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); - - my @samples=split(",",$conf{compare}{list_samples}); - my @read_lengths=split(",",$conf{compare}{list_read_lengths}); - for my $i (0..$#samples){ - my @l=split("-",$read_lengths[$i]); - $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; - } - } - } - - -} -#------------------------------------------------------------------------------# -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/SVDetect_compare.xml --- a/svdetect/SVDetect_compare.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,218 +0,0 @@ - - -structural variants between two samples - -SVDetect_compare.pl links2compare -conf '$config_file' -l '$log_file' -N '$sample_name.$reference_name' - -#if $links2SV --out1 '$common_sv_file' --out2 '$sample_sv_file' --out3 '$reference_sv_file' -#end if - -#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2circos --out4 '$common_circos_file' --out5 '$sample_circos_file' --out6 '$reference_circos_file' -#end if - -#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2bed --out7 '$common_bed_file' --out8 '$sample_bed_file' --out9 '$reference_bed_file' -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - links2SV is True - - - links2SV is True - - - links2SV is True - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2circos'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2circos'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2circos'] is True - ) - - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2bed'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2bed'] is True - ) - - - - ( - file_conversion['file_conversion_select']=="convert" and - file_conversion['links2bed'] is True - ) - - - - - - - - - - -<general> -output_dir=$__new_file_path__/svdetect -</general> - -#if $file_conversion.file_conversion_select == "convert" -#if $file_conversion.links2circos -<circos> -organism_id=${file_conversion.organism_id} -<colorcode> -#for $color_repeat in $file_conversion.color_code -${color_repeat.color}=${color_repeat.interval} -#end for -</colorcode> -</circos> -#end if -#if $file_conversion.links2bed -<bed> -<colorcode> -#for $color_repeat in $file_conversion.color_code -#if str($color_repeat.color)== "grey" -190,190,190=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "black" -0,0,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "blue" -0,0,255=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "green" -0,255,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "purple" -153,50,205=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "orange" -255,140,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "red" -255,0,0=${color_repeat.interval} -#end if -#end for -</colorcode> -</bed> -#end if -#end if - -<compare> -list_samples=${sample_name},${reference_name} -list_read_lengths=${sample_read1_length}-${sample_read2_length},${reference_read1_length}-${reference_read2_length} -sample_link_file=${sample_mates_file} -reference_link_file=${reference_mates_file} -min_overlap=${min_overlap} -same_sv_type=${same_sv_type} -sv_output=${links2SV} -#if $file_conversion.file_conversion_select == "convert" -circos_output=${$file_conversion.links2circos} -bed_output=${$file_conversion.links2bed} -#end if -</compare> - - - - - -**What it does** - -SVDetect - Version : 0.8b - -Comparison of clusters between two samples to get common or sample-specific SVs - -This program is designed to compare filtered links between two anomalously mapped mate-pair/paired-end datasets -and to identify common and sample-specific SVs (like the usual sample/reference design). -Overlaps between coordinates of clusters and types of SVs are used as parameters of comparison. - -Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/SVDetect_import.sh --- a/svdetect/SVDetect_import.sh Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -#!/bin/bash - - -while getopts "i:o:" optionName; do -case "$optionName" in - -i) INPUT="$OPTARG";; -o) OUTPUT="$OPTARG";; - -esac -done - -rm $OUTPUT - -ln -s $INPUT $OUTPUT diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/SVDetect_import.xml --- a/svdetect/SVDetect_import.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ - - BAM, chromosome info or sv files - SVDetect_import.sh -i $file_path - #if str($type.file_type)=="bam" - -o $outbamfile - #elif str($type.file_type)=="len" - -o $outlenfile - #elif str($type.file_type)=="sv" - -o $outsvfile - #end if - - - - - - - - - - - - - - - - - - - - - - - - type['file_type']=="bam" - - - type['file_type']=="len" - - - type['file_type']=="sv" - - - -**What it does** - -This tool allows you to import quickly a BAM file, a chromosome info file or a SVDetect output file from you computer as inputs for SVDetect. - - -**Example of chromosome file** - -Input len file:: - - 1 chr1 247249719 - 2 chr2 242951149 - 3 chr3 199501827 - 4 chr4 191273063 - 5 chr5 180857866 - 6 chr6 170899992 - 7 chr7 158821424 - 8 chr8 146274826 - 9 chr9 140273252 - 10 chr10 135374737 - 11 chr11 134452384 - 12 chr12 132349534 - 13 chr13 114142980 - 14 chr14 106368585 - 15 chr15 100338915 - 16 chr16 88827254 - 17 chr17 78774742 - 18 chr18 76117153 - 19 chr19 63811651 - 20 chr20 62435964 - 21 chr21 46944323 - 22 chr22 49691432 - 23 chrX 154913754 - 24 chrY 57772954 - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/SVDetect_run_parallel.pl --- a/svdetect/SVDetect_run_parallel.pl Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3537 +0,0 @@ -#!/usr/bin/perl -w - -=pod - -=head1 NAME - -SVDetect - Program designed to the detection of structural variations -from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads - -Version: 0.8b for Galaxy - -=head1 SYNOPSIS - -SVDetect -conf [-help] [-man] - - Command: - - linking detection and isolation of links - filtering filtering of links according different parameters - links2circos links conversion to circos format - links2bed paired-ends of links converted to bed format (UCSC) - links2SV formatted output to show most significant SVs - cnv calculate copy-number profiles - ratio2circos ratio conversion to circos density format - ratio2bedgraph ratio conversion to bedGraph density format (UCSC) - -=head1 DESCRIPTION - -This is a command-line interface to SVDetect. - - -=head1 AUTHORS - -Bruno Zeitouni Ebruno.zeitouni@curie.frE, -Valentina Boeva Evalentina.boeva@curie.frE - -=cut - -# ------------------------------------------------------------------- - -use strict; -use warnings; - -use Pod::Usage; -use Getopt::Long; - -use Config::General; -use Tie::IxHash; -use FileHandle; -use Parallel::ForkManager; - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#PARSE THE COMMAND LINE -my %OPT; -GetOptions(\%OPT, - 'conf=s', - 'out1=s', #GALAXY - 'out2=s', #GALAXY - 'out3=s', #GALAXY - 'out4=s', #GALAXY - 'out5=s', #GALAXY - 'l=s', #GALAXY - 'N=s',#GALAXY - 'help',#GALAXY - 'man' - ); - -pod2usage() if $OPT{help}; -pod2usage(-verbose=>2) if $OPT{man}; -pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); - -pod2usage() if(@ARGV<1); - -tie (my %func, 'Tie::IxHash',linking=>\&createlinks, - filtering=>\&filterlinks, - links2circos=>\&links2circos, - links2bed=>\&links2bed, - links2compare=>\&links2compare, - links2SV=>\&links2SV, - cnv=>\&cnv, - ratio2circos=>\&ratio2circos, - ratio2bedgraph=>\&ratio2bedgraph); - -foreach my $command (@ARGV){ - pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); -} -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#READ THE CONFIGURATION FILE -my $conf=Config::General->new( -ConfigFile => $OPT{conf}, - -Tie => "Tie::IxHash", - -AllowMultiOptions => 1, - -LowerCaseNames => 1, - -AutoTrue => 1); -my %CONF= $conf->getall; -validateconfiguration(\%CONF); #validation of the configuration parameters - -my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY - -my $pt_log_file=$OPT{l}; #GALAXY -my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY -my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY -my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY -my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY -my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY - -$CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY -$CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY - -my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY -open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#COMMAND EXECUTION -foreach my $command (@ARGV){ - &{$func{$command}}(); -} -print LOG "-- end\n";#GALAXY - -close LOG;#GALAXY -system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY -exit(0); -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# - - -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# -#FUNCTIONS -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 1: Detection of links from mate-pairs data -sub createlinks{ - - my %CHR; #main hash table 1: fragments, links - my %CHRID; - my @MATEFILES; - - my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; - my @path=split(/\//,$output_prefix); - $output_prefix=$CONF{general}{output_dir}.$path[$#path]; - my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; - my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; - - shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - $CONF{general}{cmap_file}); - - if($CONF{detection}{split_mate_file}){ - - splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, - $CONF{general}{sv_type}, - $CONF{general}{mates_file}, - $CONF{general}{input_format}, - $CONF{general}{read_lengths} - ); - }else{ - - @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; - chomp(@MATEFILES); - print LOG "# Splitted mate files already created.\n"; - } - - - #Parallelization of the linking per chromosome for intra + interchrs - my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); - - foreach my $matefile (@MATEFILES){ - - my $pid = $pm->start and next; - getlinks(\%CHR, \%CHRID, $matefile); - $pm->finish; - - } - $pm->wait_all_children; - - #Merge the chromosome links file into only one - my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; - chomp(@LINKFILES); - catFiles( \@LINKFILES => "$output_prefix.links" ); - - system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY - print LOG "# Linking end procedure : output created: $output_prefix.links\n"; - #unlink(@LINKFILES); - #unlink(@MATEFILES); - - undef %CHR; - undef %CHRID; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getlinks { - - my ($chr,$chrID,$tmp_mates_prefix)=@_; - - my $tmp_links_prefix=$tmp_mates_prefix; - $tmp_links_prefix=~s/\/mates\//\/links\//; - - my %PAIR; #main hash table 2: pairs - - linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs - $CONF{general}{read_lengths}, - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - $tmp_mates_prefix, - $CONF{general}{input_format}, - $CONF{general}{sv_type}, - "$tmp_links_prefix.links.mapped" - ); - - getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons - "$tmp_links_prefix.links.unique"); - - defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links - $CONF{general}{input_format}, - $CONF{general}{sv_type}, - $CONF{general}{read_lengths}, - "$tmp_links_prefix.links.unique", - "$tmp_links_prefix.links.unique_defined"); - - sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates - "$tmp_links_prefix.links.sorted"); - - removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links - "$tmp_links_prefix.links",1); #file output - - - undef %PAIR; - - unlink("$tmp_links_prefix.links.mapped", - "$tmp_links_prefix.links.unique", - "$tmp_links_prefix.links.unique_defined", - "$tmp_links_prefix.links.sorted"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub splitMateFile{ - - my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; - - print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; - - my %filesHandle; - - #fichier matefile inter - if($sv_type=~/^(all|inter)$/){ - my $newFileName="$output_prefix.interchrs"; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{inter}=$fh; - } - - #fichiers matefiles intra - if($sv_type=~/^(all|intra)$/){ - foreach my $k (1..$chr->{nb_chrs}){ - my $newFileName=$output_prefix.".".$chr->{$k}->{name}; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{$k}=$fh; - } - } - - if ($mates_file =~ /.gz$/) { - open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat - }elsif($mates_file =~ /.bam$/){ - open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY - }else{ - open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; - } - - - while(){ - - my @t=split; - my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); - - next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); - - next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); - - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ - my $fh2print=$filesHandle{inter}; - print $fh2print join("\t",@t)."\n"; - } - - if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ - my $fh2print=$filesHandle{$chr_read1}; - print $fh2print join("\t",@t)."\n"; - - } - } - - foreach my $name (keys %filesHandle){ - my $fh=$filesHandle{$name}; - $fh->close; - } - - print LOG "# Splitted mate files of \"$mates_file\" created.\n"; -} - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub filterlinks{ - - my %CHR; - my %CHRID; - my @LINKFILES; - my @FLINKFILES; - - my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; - my @path=split(/\//,$output_prefix); - $output_prefix=$CONF{general}{output_dir}.$path[$#path]; - my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; - - createChrHashTables(\%CHR,\%CHRID, - $CONF{general}{cmap_file}); - - if($CONF{filtering}{split_link_file}){ - - splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, - $tmp_links_prefix, - $CONF{general}{sv_type}, - "$output_prefix.links", - ); - }else{ - - @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; - chomp(@LINKFILES); - print LOG "# Splitted link files already created.\n"; - } - - #Parallelization of the filtering per chromosome for intra + interchrs - my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); - - foreach my $linkfile (@LINKFILES){ - - my $pid = $pm->start and next; - getFilteredlinks(\%CHR, \%CHRID, $linkfile); - $pm->finish; - - } - $pm->wait_all_children; - - #Merge the chromosome links file into only one - @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; - chomp(@FLINKFILES); - catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); - - system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY - print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; - - undef %CHR; - undef %CHRID; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub splitLinkFile{ - - my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; - - print LOG "# Splitting the link file for parallel processing...\n"; - - my %filesHandle; - - #fichier matefile inter - if($sv_type=~/^(all|inter)$/){ - my $newFileName="$input_prefix.interchrs.links"; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{inter}=$fh; - } - - #fichiers matefiles intra - if($sv_type=~/^(all|intra)$/){ - foreach my $k (1..$chr->{nb_chrs}){ - my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; - push(@{$files_list},$newFileName); - my $fh = new FileHandle; - $fh->open(">$newFileName"); - $filesHandle{$k}=$fh; - } - } - - open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; - while(){ - - my @t=split; - my ($chr_read1,$chr_read2)=($t[0],$t[3]); - - next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); - - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ - my $fh2print=$filesHandle{inter}; - print $fh2print join("\t",@t)."\n"; - } - - if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ - my $fh2print=$filesHandle{$chr_read1}; - print $fh2print join("\t",@t)."\n"; - - } - } - - foreach my $name (keys %filesHandle){ - my $fh=$filesHandle{$name}; - $fh->close; - } - - print LOG "# Splitted link files created.\n"; -} - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 2: Filtering processing -sub getFilteredlinks { - - my ($chr,$chrID,$tmp_links_prefix)=@_; - my %PAIR; - - strandFiltering($chr,$chrID, - $CONF{filtering}{nb_pairs_threshold}, #filtering of links - $CONF{filtering}{strand_filtering}, - $CONF{filtering}{chromosomes}, - $CONF{general}{input_format}, - $CONF{general}{cmap_file}, - $CONF{general}{mates_orientation}, - $CONF{general}{read_lengths}, - $tmp_links_prefix, - "$tmp_links_prefix.filtered", - ); - - if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering - - my @tmpfiles; - - rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); - - getUniqueLinks("$tmp_links_prefix.filtered_unique", - "$tmp_links_prefix.filtered"); - - push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); - - if($CONF{filtering}{order_filtering}){ #filtering using the order - - rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); - - orderFiltering($chr,$chrID, - $CONF{filtering}{nb_pairs_threshold}, - $CONF{filtering}{nb_pairs_order_threshold}, - $CONF{filtering}{mu_length}, - $CONF{filtering}{sigma_length}, - $CONF{general}{mates_orientation}, - $CONF{general}{read_lengths}, - "$tmp_links_prefix.filtered_ordered", - "$tmp_links_prefix.filtered", - ); - - push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); - } - - if (($CONF{filtering}{insert_size_filtering})&& - ($CONF{general}{sv_type} ne 'inter')){ - - rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); - - addInsertionInfo($chr,$chrID, - $CONF{filtering}{nb_pairs_threshold}, - $CONF{filtering}{order_filtering}, - $CONF{filtering}{indel_sigma_threshold}, - $CONF{filtering}{dup_sigma_threshold}, - $CONF{filtering}{singleton_sigma_threshold}, - $CONF{filtering}{mu_length}, - $CONF{filtering}{sigma_length}, - $CONF{general}{mates_orientation}, - $CONF{general}{read_lengths}, - "$tmp_links_prefix.filtered_withoutIndelSize", - "$tmp_links_prefix.filtered" - ); - - push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); - } - - sortLinks("$tmp_links_prefix.filtered", - "$tmp_links_prefix.filtered_sorted"); - - removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", - "$tmp_links_prefix.filtered_nodup", - ); - - postFiltering("$tmp_links_prefix.filtered_nodup", - "$tmp_links_prefix.filtered", - $CONF{filtering}{final_score_threshold}); - - push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); - - unlink(@tmpfiles); - - - } - undef %PAIR; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 3: Circos format conversion for links -sub links2circos{ - - my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".segdup.txt"; - - links2segdup($CONF{circos}{organism_id}, - $CONF{circos}{colorcode}, - $input_file, - $output_file); #circos file output - - system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 4: Bed format conversion for links -sub links2bed{ - - my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".bed.txt"; - - links2bedfile($CONF{general}{read_lengths}, - $CONF{bed}{colorcode}, - $input_file, - $output_file); #bed file output - - system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 6: Bed format conversion for links -sub links2SV{ - - my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; - - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".sv.txt"; - - - links2SVfile( $input_file, - $output_file); - - system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 7: copy number variations, coverage ratio calculation -sub cnv{ - - my %CHR; - my %CHRID; - my @MATEFILES; - my @MATEFILES_REF; - - my $output_prefix=$CONF{general}{mates_file}; - my $output_prefix_ref=$CONF{detection}{mates_file_ref}; - my @path=split(/\//,$output_prefix); - my @path_ref=split(/\//,$output_prefix_ref); - $output_prefix=$CONF{general}{output_dir}.$path[$#path]; - $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; - my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; - my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; - my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; - - shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - $CONF{general}{cmap_file}); - - if($CONF{detection}{split_mate_file}){ - - splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, - "intra", - $CONF{general}{mates_file}, - $CONF{general}{input_format}, - $CONF{general}{read_lengths} - ); - - splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, - "intra", - $CONF{detection}{mates_file_ref}, - $CONF{general}{input_format}, - $CONF{general}{read_lengths} - ); - - - }else{ - - @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; - chomp(@MATEFILES); - @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; - chomp(@MATEFILES_REF); - print LOG "# Splitted sample and reference mate files already created.\n"; - } - - #Parallelization of the cnv per chromosome - my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); - - foreach my $file (0..$#MATEFILES){ - - my $pid = $pm->start and next; - - densityCalculation(\%CHR, \%CHRID, $file, - $CONF{general}{read_lengths}, - $CONF{detection}{window_size}, - $CONF{detection}{step_length}, - \@MATEFILES, - \@MATEFILES_REF, - $MATEFILES[$file].".density", - $CONF{general}{input_format}); - - $pm->finish; - - } - $pm->wait_all_children; - - #Merge the chromosome links file into only one - my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; - chomp(@DENSITYFILES); - catFiles( \@DENSITYFILES => "$output_prefix.density" ); - - print LOG "# cnv end procedure : output created: $output_prefix.density\n"; - - - undef %CHR; - undef %CHRID; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 8: Circos format conversion for cnv ratios -sub ratio2circos{ - - my $input_file=$CONF{general}{mates_file}.".density"; - - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".segdup.txt"; - - ratio2segdup($CONF{circos}{organism_id}, - $input_file, - $output_file); -} -#------------------------------------------------------------------------------# -#MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios -sub ratio2bedgraph{ - - my $input_file=$CONF{general}{mates_file}.".density"; - - my @path=split(/\//,$input_file); - $input_file=$CONF{general}{output_dir}.$path[$#path]; - - my $output_file.=$input_file.".bedgraph.txt"; - - ratio2bedfile($input_file, - $output_file); #bed file output -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Creation of the fragment library -sub shearingChromosome{ - - print LOG "# Making the fragments library...\n"; - - my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters - - createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name - - foreach my $k (1..$chr->{nb_chrs}){ - - print LOG"-- $chr->{$k}->{name}\n"; - - my $frag=1; - for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ - - my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; - $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage - - if($end==($chr->{$k}->{length})-1){ - $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome - last; - } - $frag++; - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Creation of chromosome hash tables from the cmap file -sub createChrHashTables{ - - my ($chr,$chrID,$cmap_file)=@_; - $chr->{nb_chrs}=0; - - open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; - while(){ - - if(/^\s+$/){ next;} - my ($k,$name,$length) = split; - $chr->{$k}->{name}=$name; - $chr->{$k}->{length}=$length; - $chrID->{$name}=$k; - $chr->{nb_chrs}++; - - } - close CMAP; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Read the mate file according the input format file (solid, eland or sam) -sub readMateFile{ - - my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; - my ($strand1,$strand2); - - if($file_type eq "solid"){ - - ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based - - }else{ - my ($tag_length1,$tag_length2); - ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based - if($file_type eq "eland"); - - if($file_type eq "sam"){ - - return 0 if ($$t[0]=~/^@/); #header sam filtered out - - ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); - - return 0 if (($$t[1]&0x0004) || ($$t[1]&0x0008)); - - $$chr2=$$chr1 if($$chr2 eq "="); - - $strand1 = (($$t[1]&0x0010))? 'R':'F'; - $strand2 = (($$t[1]&0x0020))? 'R':'F'; - - $$order1= (($$t[1]&0x0040))? '1':'2'; - $$order2= (($$t[1]&0x0080))? '1':'2'; - $tag_length1 = $tag_length->{$$order1}; - $tag_length2 = $tag_length->{$$order2}; - } - - $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts - $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); - } - return 1; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Parsing of the mates files and creation of links between 2 chromosomal fragments -sub linking{ - - my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; - my %link; - - my $record=0; - my $nb_links=0; - my $warn=10000; - - my @sfile=split(/\./,$mates_file); - my $fchr=$sfile[$#sfile]; - - my $fh = new FileHandle; - - print LOG "# $fchr : Linking procedure...\n"; - print LOG "-- file=$mates_file\n". - "-- chromosome=$fchr\n". - "-- input format=$input_format\n". - "-- type=$sv_type\n". - "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". - "-- window size=$window_dist, step length=$step\n"; - - if ($mates_file =~ /.gz$/) { - $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat - }elsif($mates_file =~ /.bam$/){ - $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY - }else{ - $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; - } - - - while(<$fh>){ - - my @t=split; #for each mate-pair - my $mate=$t[0]; - my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); - - next if(exists $$pair{$mate}); - - next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); - - next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); - - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - if($sv_type ne "all"){ - if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || - ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ - }else{ - next; - } - } - - $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) - - $record++; - - my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read - - recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); - recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); - - for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation - - if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ - - if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ - - for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ - - if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { - - if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ - - makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link - } - - }else{ - - $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); - } - } - } - - }else{ - - $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); - } - } - - if($record>=$warn){ - print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; - $warn+=10000; - } - } - $fh->close; - - if(!$nb_links){ - print LOG "-- $fchr : No mate-pairs !\n". - "-- $fchr : No links have been found with the selected type of structural variations $$sv_type$\n"; - } - - print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; - - print LOG "-- $fchr : writing...\n"; - - $fh = new FileHandle; - - $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; - - foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output - - foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ - - foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ - - foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ - - my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); - print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". - "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". - scalar @count."\t". #nb of read - $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list - } - } - } - } - - $fh->close; - - undef %link; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#remove exact links doublons according to the mate list -sub getUniqueLinks{ - - my ($links_file,$nrlinks_file)=@_; - my %links; - my %pt; - my $nb_links; - my $n=1; - - my $record=0; - my $warn=300000; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - my $fh = new FileHandle; - - print LOG "# $fchr : Getting unique links...\n"; - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - - while(<$fh>){ - - my @t=split; - my $mates=$t[7]; - $record++; - - if(!exists $links{$mates}){ #Unique links selection - - $links{$mates}=[@t]; - $pt{$n}=$links{$mates}; - $n++; - - - }else{ #get the link coordinates from the mate-pairs list - - for my $i (1,2,4,5){ #get the shortest regions - - $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start - if($i==1 || $i==4); - $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end - if($i==2 || $i==5); - } - } - if($record>=$warn){ - print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; - $warn+=300000; - } - } - $fh->close; - - $nb_links=$n-1; - print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; - - $fh = new FileHandle; - $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; - print LOG "-- $fchr : writing...\n"; - for my $i (1..$nb_links){ - - print $fh join("\t",@{$pt{$i}})."\n"; #all links output - } - - $fh->close; - - undef %links; - undef %pt; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#get the new coordinates of each link from the mate list -sub defineCoordsLinks{ - - my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; - - print LOG "# $fchr : Defining precise link coordinates...\n"; - - my $record=0; - my $warn=100000; - - my %coords; - my %strands; - my %order; - my %ends_order; - - while(<$fh>){ - - - my ($col1,$col2)=(1,2); #for an intrachromosomal link - my $diffchr=0; #difference between chr1 and chr2 - my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; - ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); - if ($chr1 != $chr2){ #for an interchromosomal link - $col1=$col2=0; #no distinction - $diffchr=1; - } - - my @pairs=split(",",$mates_list); - - $coords{$col1}{$chr1}->{start}=undef; - $coords{$col1}{$chr1}->{end}=undef; - $coords{$col2}{$chr2}->{start}=undef; - $coords{$col2}{$chr2}->{end}=undef; - $strands{$col1}{$chr1}=undef; - $strands{$col2}{$chr2}=undef; - $ends_order{$col1}{$chr1}=undef; - $ends_order{$col2}{$chr2}=undef; - - - $order{$col1}{$chr1}->{index}->{1}=undef; - $order{$col1}{$chr1}->{index}->{2}=undef; - $order{$col2}{$chr2}->{index}->{1}=undef; - $order{$col2}{$chr2}->{index}->{2}=undef; - $order{$col1}{$chr1}->{order}=undef; - $order{$col2}{$chr2}->{order}=undef; - - $record++; - - for my $p (0..$#pairs){ #for each pair - - my ($coord_start_read1,$coord_end_read1); - my ($coord_start_read2,$coord_end_read2); - my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); - my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); - - if(!$diffchr){ #for a intrachromosomal link - if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column - ($col1,$col2)=(2,1); - }else{ - ($col1,$col2)=(1,2); - } - } - - push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads - push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); - push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); - push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); - push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); - push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); - push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); - push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); - } - - ($col1,$col2)=(1,2) if(!$diffchr); - - my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region - my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); - my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); - my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); - - @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; - @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; - - foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) - foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ - - if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ - ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; - ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; - last; - } - } - } - - foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID - foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ - - if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ - ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; - ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; - last; - } - } - } - - @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads - @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); - @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); - @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); - @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); - @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); - @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); - - - my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output - $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, - scalar @pairs, - join(",",@pairs), - join(",",@{$strands{$col1}{$chr1}}), - join(",",@{$strands{$col2}{$chr2}}), - join(",",@{$ends_order{$col1}{$chr1}}), - join(",",@{$ends_order{$col2}{$chr2}}), - join(",",@{$order{$col1}{$chr1}->{order}}), - join(",",@{$order{$col2}{$chr2}->{order}}), - join(",",@{$coords{$col1}{$chr1}->{start}}), - join(",",@{$coords{$col2}{$chr2}->{start}})); - - print $fh2 join("\t",@link)."\n"; - - if($record>=$warn){ - print LOG "-- $fchr : $warn links processed\n"; - $warn+=100000; - } - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $record links processed\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Sort links according the concerned chromosomes and their coordinates -sub sortLinks{ - - my ($links_file,$sortedlinks_file,$unique)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - print LOG "# $fchr : Sorting links...\n"; - - my $pipe=($unique)? "| sort -u":""; - system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#removal of fully overlapped links -sub removeFullyOverlappedLinks{ - - my ($links_file,$nrlinks_file,$warn_out)=@_; - - my %pt; - my $n=1; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - my $fh = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - while(<$fh>){ - - my @t=split("\t",$_); - $pt{$n}=[@t]; - $n++; - } - $fh->close; - - my $nb_links=$n-1; - my $nb=$nb_links; - - my %pt2; - my $nb2=1; - my $record=0; - my $warn=10000; - - print LOG "# $fchr : Removing fully overlapped links...\n"; - - LINK: - - for my $i (1..$nb){ - - my @link=(); - my @next_link=(); - my $ind1=$i; - - $record++; - if($record>=$warn){ - print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; - $warn+=10000; - } - - if(exists $pt{$ind1}){ - @link=@{$pt{$ind1}}; #link1 - }else{ - next LINK; - } - - my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 - my @mates=deleteBadOrderSensePairs(split(",",$link[7])); - - my $ind2=$ind1+1; - $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link - - if($ind2<=$nb){ - - @next_link=@{$pt{$ind2}}; #link2 - my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 - my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); - - while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter - - if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 - - $ind2++; - $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); - - if($ind2>$nb){ #if no more link in the file -> save link1 - - $pt2{$nb2}=\@link; - $nb2++; - next LINK; - } - - @next_link=@{$pt{$ind2}}; - ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); - @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); - next; - } - - my %mates=map{$_ =>1} @mates; #get the equal number of mates - my @same_mates = grep( $mates{$_}, @next_mates ); - my $nb_mates= scalar @same_mates; - - if($nb_mates == scalar @mates){ - - delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 - next LINK; #go to link2, link2 becomes link1 - - }else{ - delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 - $ind2++; #we continue by checking the next link2 - $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); - - if($ind2>$nb){ #if no more link in the file -> save link1 - - $pt2{$nb2}=\@link; - $nb2++; - next LINK; - } - - @next_link=@{$pt{$ind2}}; #get info of link2 - ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); - @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); - - } - } - } - $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 - $nb2++; - } - - print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; - - #OUTPUT - - $fh = new FileHandle; - $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; - print LOG "-- $fchr : writing...\n"; - for my $i (1..$nb2-1){ - - print $fh join("\t",@{$pt2{$i}}); #all links output - } - - close $fh; - - print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); - - undef %pt; - undef %pt2; -} -#------------------------------------------------------------------------------# -sub postFiltering { - - my ($links_file,$pflinks_file, $finalScore_thres)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - my ($nb,$nb2)=(0,0); - - print LOG "# $fchr : Post-filtering links...\n"; - print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; - - - while(<$fh>){ - - my @t=split("\t",$_); - my $score=$t[$#t-1]; - - if($score >= $finalScore_thres){ - print $fh2 join("\t", @t); - $nb2++; - } - $nb++; - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; - print LOG "-- $fchr : output created: $pflinks_file\n"; -} - - - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Filtering of the links -sub strandFiltering{ - - my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, - $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-1]; - - - my %chrs; - my %chrs1; - my %chrs2; - my $nb_chrs; - my $exclude; - - if($chromosomes){ - my @chrs=split(",",$chromosomes); - $nb_chrs=scalar @chrs; - $exclude=($chrs[0]=~/^\-/)? 1:0; - for my $chrName (@chrs){ - $chrName=~s/^(\-)//; - my $col=($chrName=~s/_(1|2)$//); - - if(!$col){ - $chrs{$chrID->{$chrName}}=undef - }else{ - $chrs1{$chrID->{$chrName}}=undef if($1==1); - $chrs2{$chrID->{$chrName}}=undef if($1==2); - } - } - } - - my $record=0; - my $nb_links=0; - my $warn=10000; - - my $sens_ratio_threshold=0.6; - - print LOG "\# Filtering procedure...\n"; - print LOG "\# Number of pairs and strand filtering...\n"; - print LOG "-- file=$links_file\n"; - print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). - ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; - - - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; - - while(<$fh>){ - - my @t=split; #for each link - my $is_good=1; - $record++; - - - if($chromosomes){ - - my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); - - if(!$exclude){ - $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; - $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); - $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); - }else{ - $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; - $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); - } - } - - $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs - if($is_good && $strand_filtering){ #if filtering according the strand sense - - my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense - my @strands1=split(/,/,$t[8]); - my @strands2=split(/,/,$t[9]); - - my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); - - my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR - 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF - - my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", - substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", - inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); - - if($t[6] == 1){ - - push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); - - }else{ - - tie (my %class,'Tie::IxHash'); - my $split; - - foreach my $i (0..$#mates){ - $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group - } - - my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; - my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; - my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); - - if($sens_ratio < $sens_ratio_threshold){ - %class=(1=>'FF', 2=>'FR'); - $split=1; - }else{ - $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class - $split=0; - } - - $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); - } - } - - if($is_good){ #PRINT - - my $nb=scalar @t; - if($nb > 20){ - my @t2=splice(@t,0,20); - print $fh2 join("\t",@t2)."\n"; - $nb_links++; - } - $nb_links++; - print $fh2 join("\t",@t)."\n"; - } - - if($record>=$warn){ - print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; - $warn+=10000; - } - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); - - print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; - - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getConsistentSenseLinks{ - - my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; - - my $npairs=scalar @$mates; - - my @ends_order1 = split (/,/,$$t[10]); - my @ends_order2 = split (/,/,$$t[11]); - my @order1 = split (/,/,$$t[12]); - my @order2 = split (/,/,$$t[13]); - my @positions1 = split (/,/,$$t[14]); - my @positions2 = split (/,/,$$t[15]); - - my @newlink; - - foreach my $ind (keys %{$class} ){ - - tie (my %flink,'Tie::IxHash'); - my @orders2remove=(); - - foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group - - if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ - push(@{$flink{mates}},$$mates[$i]); - push(@{$flink{strands1}},$$strands1[$i]); - push(@{$flink{strands2}},$$strands2[$i]); - push(@{$flink{ends_order1}},$ends_order1[$i]); - push(@{$flink{ends_order2}},$ends_order2[$i]); - push(@{$flink{positions1}},$positions1[$i]); - push(@{$flink{positions2}},$positions2[$i]); - - }else{ - push(@orders2remove,$order1[$i]); - } - } - - @{$flink{order1}}=(); - @{$flink{order2}}=(); - if(scalar @orders2remove > 0){ - getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) - }else{ - @{$flink{order1}}=@order1; - @{$flink{order2}}=@order2; - } - - my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); - my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); - - my $fnpairs=scalar @{$flink{mates}}; - my $strand_filtering_ratio=$fnpairs."/".$npairs; - my $real_ratio=$fnpairs/$npairs; - - if($fnpairs>=$thres){ #filtering according the number of pairs - - push(@newlink, - $$t[0], - min(min(@{$flink{positions1}}),min(@ends1)), - max(max(@{$flink{positions1}}),max(@ends1)), - $$t[3], - min(min(@{$flink{positions2}}),min(@ends2)), - max(max(@{$flink{positions2}}),max(@ends2)), - $fnpairs, - join(",",@{$flink{mates}}), - join(",",@{$flink{strands1}}), - join(",",@{$flink{strands2}}), - join(",",@{$flink{ends_order1}}), - join(",",@{$flink{ends_order2}}), - join(",",@{$flink{order1}}), - join(",",@{$flink{order2}}), - join(",",@{$flink{positions1}}), - join(",",@{$flink{positions2}}), - $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, - $strand_filtering_ratio, - $real_ratio, - $npairs - ); - } - } - - if (grep {defined($_)} @newlink) { - @$t=@newlink; - return 1 - } - return 0; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getNewOrders{ - - my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; - my $j=1; - my $k=1; - for my $i (0..$#{$tab2}){ - my $c=0; - for my $j (0..$#{$list}){ - $c++ if(${$list}[$j] < ${$tab2}[$i]); - if(${$list}[$j] == ${$tab2}[$i]){ - $c=-1; last; - } - } - if($c!=-1){ - push(@{$newtab2}, ${$tab2}[$i]-$c); - push(@{$newtab1}, $k); - $k++; - } - } -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#Filtering of the links using their order -sub orderFiltering { - - my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; - - my $record=0; - my $warn=10000; - my $nb_links=0; - - my $quant05 = 1.644854; - my $quant001 = 3.090232; - my $alphaDist = $quant05 * 2 * $sigma; - my $maxFragmentLength = &floor($quant001 * $sigma + $mu); - - print LOG "\# Filtering by order...\n"; - print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; - print LOG "-- distance between comparable pairs was set to $alphaDist\n"; - print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; - - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; - - while(<$fh>){ - - $record++; - my @t = split; - my ($chr1,$chr2,$mates_list)=@t[0,3,7]; - my @pairs=split(",",$mates_list); - ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); - my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; - my $numberOfPairs = $t[6]; - my @strand1 = split (/,/,$t[8]); - my @strand2 = split (/,/,$t[9]); - my @ends_order1 = split (/,/,$t[10]); - my @ends_order2 = split (/,/,$t[11]); - my @order1 = split (/,/,$t[12]); - my @order2 = split (/,/,$t[13]); - my @positions1 = split (/,/,$t[14]); - my @positions2 = split (/,/,$t[15]); - my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); - my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); - my $clusterCoordinates_chr1; - my $clusterCoordinates_chr2; - my $reads_left = 0; - - my $ifRenv = $t[16]; - my $strand_ratio_filtering=$t[17]; - - #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if orientation is correct - my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); - #find pairs type F-RRRR or FFFF-R in the case if orientation is correct - #These pairs are annotated as BED pairs forever! They won't be recycled! - my $table; - for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs - my $nonAdeq = 0; - for my $j (0..$i-1) { - if (exists($table->{$j}->{$i})) { - $nonAdeq++; - $table->{$i}->{$j} = 1; - } - } - for my $j ($i+1..$numberOfPairs-1) { - if ($positions1[$j]-$positions1[$i]>$alphaDist) { - if (&reversed ($i,$j,$ifRenv,\@positions2)) { - $nonAdeq++; - $table->{$i}->{$j} = 1; - } - } - } - $table->{$i}->{nonAdeq} = $nonAdeq; - } - - for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of orientation - &remove($bad,$table); - } - - my @falseReads; - #RRRR-F -> RRRR or R-FFFF -> FFFF - @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); - #these pairs will be recycled later as $secondTable - for my $bad (@falseReads) { - &remove($bad,$table); - } - - my $bad = &check($table); - while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER - # push (@falseReads, $bad); remove completely!!! - &remove($bad,$table); - $bad = &check($table); - } - - $reads_left = scalar keys %{$table}; - my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); - my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); - my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); - my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); - - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; - - my $ifBalanced = 'UNBAL'; - my $secondTable; - my $clusterCoordinates; - my ($break_pont_chr1,$break_pont_chr2); - - my $signatureType=""; - - my $maxCoord1 =$chr->{$chr1}->{length}; - my $maxCoord2 =$chr->{$chr2}->{length}; - - if (scalar @falseReads) { - @falseReads = sort @falseReads; - #now delete FRFR choosing the majority - my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF - @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled - my %hashTmp; - for my $count1 (0..scalar(@falseReads)-1) { - my $i = $falseReads[$count1]; - $hashTmp{$i} = 1; - for my $bad (@newfalseReads) { - if ($bad == $i) { - delete $hashTmp{$i}; - next; - } - } - } - @falseReads = sort keys %hashTmp; #what is left - for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected - my $nonAdeq = 0; - my $i = $falseReads[$count1]; - - for my $count2 (0..$count1-1) { - my $j = $falseReads[$count2]; - if (exists($secondTable->{$j}->{$i})) { - $nonAdeq++; - $secondTable->{$i}->{$j} = 1; - } - } - for my $count2 ($count1+1..scalar(@falseReads)-1) { - my $j = $falseReads[$count2]; - if ($positions1[$j]-$positions1[$i]>$alphaDist) { - if (&reversed ($i,$j,$ifRenv,\@positions2)) { - $nonAdeq++; - $secondTable->{$i}->{$j} = 1; - } - } - } - $secondTable->{$i}->{nonAdeq} = $nonAdeq; - } - - my @falseReads2; - my $bad = &check($secondTable); - while ($bad ne "OK") { #clear the table to reject non adequate pairs - push (@falseReads2, $bad); - &remove($bad,$secondTable); - $bad = &check($secondTable); - } - if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { - my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); - my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); - my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); - my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); - - $ifBalanced = 'BAL'; - - if ($ifBalanced eq 'BAL') { - - if (scalar keys %{$table} < $nb_pairs_order_threshold) { - $ifBalanced = 'UNBAL'; #kill cluster 1! - ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 - $reads_left = scalar keys %{$table}; - $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; - $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; - $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; - $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; - - } else { - - $reads_left += scalar keys %{$secondTable}; - next if ($reads_left < $nb_pairs_threshold); - - if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { - ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 - - ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); - ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); - ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); - ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); - - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; - }else { - $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; - $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; - } - $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); - $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); - $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); - $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); - #to calculate breakpoints one need to take into account read orientation in claster.. - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - - my @index1 = keys %{$table}; - my @index2 = keys %{$secondTable}; - - my (@generalStrand1,@generalStrand2) = 0; - - if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs - $leftLetterOk = 'R'; - $rightLetterOk = 'F'; - @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); - @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); - } else { - @generalStrand1 = @strand1; - @generalStrand2 = @strand2; # TODO check if it is correct - } - if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) - $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; - - if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; - $signatureType = "TRANSLOC"; - } else { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "INS_FRAGMT"; - } - - } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "INV_INS_FRAGMT"; - } else { - $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; - $signatureType = "INV_TRANSLOC"; - } - } else { - #should not occur - print STDERR "\nError in orderFiltering\n\n"; - } - } - - elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) - $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; - $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; - if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; - $signatureType = "INV_INS_FRAGMT"; - } else { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "INV_COAMPLICON"; - } - - } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { - if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { - $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; - $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; - $signatureType = "COAMPLICON"; - } else { - $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; - $signatureType = "INS_FRAGMT"; - } - } else { - #should not occur - $signatureType = "UNDEFINED"; - } - } - else { # (F,F) or (R,R) something strange. We will discard the smallest cluster - $ifBalanced = 'UNBAL'; - if (scalar keys %{$secondTable} > scalar keys %{$table}) { - ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 - - $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; - $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; - $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; - $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; - $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; - $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; - } - $reads_left = scalar keys %{$table}; - } - if ($ifBalanced eq 'BAL') { - $ifRenv = $signatureType; - } - } - } - } - } - if ($ifBalanced ne 'BAL') { - #define possible break point - $coord_start_chr1 = $coord_start_chr1_cluster1; - $coord_end_chr1 = $coord_end_chr1_cluster1; - $coord_start_chr2 = $coord_start_chr2_cluster1; - $coord_end_chr2 = $coord_end_chr2_cluster1; - - my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; - my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - my @index = keys %{$table}; - unless ($diff_sense_ends) { - my $firstEndOrder1 = $ends_order1[$index[0]]; - my $firstEndOrder2 = $ends_order2[$index[0]]; - $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; - $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; - } else { - $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; - $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; - } - - if ($chr1 ne $chr2){ - $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); - $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); - } - } - - if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { - next; #discard the link - } - if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { - next; #discard the link - } - my $ratioTxt = "$reads_left/".(scalar @pairs); - my ($n1,$nTot) = split ("/",$strand_ratio_filtering); - my $ratioReal = $reads_left/$nTot; - - if ($coord_start_chr1<=0) { - $coord_start_chr1=1; - } - if ($coord_start_chr2<=0) { - $coord_start_chr2=1; - } - #create output - my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output - $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, - $reads_left, - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), - &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), - &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), - &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), - $ifRenv, - $strand_ratio_filtering, - $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, - $ratioReal, $nTot); - - $nb_links++; - print $fh2 join("\t",@link)."\n"; - - if($record>=$warn){ - print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; - $warn+=10000; - } - - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#gets information about ends positions given start, direction and order -sub getEnds { - my ($ends,$starts,$strand,$end_order,$tag_length) = @_; - for my $i (0..scalar(@{$starts})-1) { - $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); - } -} -sub getEnd { - my ($start,$strand, $end_order,$tag_length) = @_; - return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#gets starts and ends Coords when start=leftmost given positions, directions and orders -sub getCoordswithLeftMost { - - my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; - - for my $i (0..scalar(@{$positions})-1) { - - if($strand->[$i] eq 'F'){ - $starts->[$i]=$positions->[$i]; - $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; - }else{ - $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; - $ends->[$i]=$positions->[$i]; - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering - - my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; - - my @sfile=split(/\./,$links_file); - my $fchr=$sfile[$#sfile-2]; - - - my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; - - my $record=0; - my $nb_links=0; - my $warn=10000; - - print LOG "\# Filtering out normal pairs using insert size...\n"; - print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; - print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". - ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; - print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". - ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; - print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); - - my $fh = new FileHandle; - my $fh2 = new FileHandle; - - $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; - $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; - - while(<$fh>){ - - $record++; - my @t = split; - my ($chr1,$chr2,$mates_list)=@t[0,3,7]; - - if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), - $nb_links++; - - $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); - $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); - - $t[16].= "\t"; - $t[19].= "\t"; - - print $fh2 join("\t",@t)."\n"; - - if($record>=$warn){ - print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; - $warn+=10000; - } - next; - } - - my $ifRenv = $t[16]; - my $ifBalanced = "UNBAL"; - $ifBalanced = $t[18] if ($order_filtering); - - my $numberOfPairs = $t[6]; - my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); - my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); - - if ($ifBalanced eq "BAL") { - - if ($ifRenv eq "INV_TRANSLOC") { - $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment - } - if ($ifRenv eq "NORMAL_SENSE") { - $ifRenv = "TRANSLOC"; - } - if ($ifRenv eq "REVERSE_SENSE") { - $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment - } - $t[19].= "\t"; - - my $meanDistance = 0; - - for my $i (0..$numberOfPairs-1) { - $meanDistance += $positions2[$i]-$positions1[$i]; - } - $meanDistance /= $numberOfPairs; - - $t[16] = $ifRenv."\t".$meanDistance; - #dont touch the annotation. It should be already OK. - - } else { - #only for unbalanced - - my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); - - my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. - (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; - my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. - (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; - - my $indel_type = $ifRenv; - - my $meanDistance = "N/A"; - - ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster - \@positions1, #assign N/A to $indel_type if unknown - \@positions2, - $ifRenv, - $ifoverlap, - $indel_sigma_threshold, - $dup_sigma_threshold, - $singleton_sigma_threshold, - $mu, - $sigma, - $ifBalanced, - $ends_sense_class, - $ends_order_class, - $mate_sense, - $diff_sense_ends, - ); - - #filtering of pairs with distance inconsistant with the SV - if ($ifRenv ne "REVERSE_SENSE") { - my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; - my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; - $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, - $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); - next if ($t[6] < $nb_pairs_threshold); - }else{ - $t[19].= "\t"; - } - $t[16] = $indel_type."\t".$meanDistance; - } - - $nb_links++; - - print $fh2 join("\t",@t)."\n"; - if($record>=$warn){ - print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; - $warn+=10000; - } - } - $fh->close; - $fh2->close; - - print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub checkIndel { - - my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, - $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; - - my $meanDistance = 0; - - for my $i (0..$numberOfPairs-1) { - $meanDistance += $positions2->[$i]-$positions1->[$i]; - } - $meanDistance /= $numberOfPairs; - - return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); - - return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); - - if($diff_sense_ends){ - return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; - return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); - }else{ - return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && - ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; - } - - return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); - - return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; - - return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); - return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); - - return ($meanDistance, "UNDEFINED"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) -sub recalc_t_usingInsertSizeInfo { - my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; - - my @badPairs; - - my @positions1 = getAllEntries($t->[14]); - my @positions2 = getAllEntries($t->[15]); - - if ($meanDistance < $mu) { - for my $i (0..scalar(@positions1)-1) { - if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { - push(@badPairs,$i); - } - } - } else { - for my $i (0..scalar(@positions1)-1) { - if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { - push(@badPairs,$i); - } - } - } - - if (scalar (@badPairs)>0) { - #print join("\t",@badPairs).": ".join("\t",@t)."\n"; - #remove these inconsistant links - $t->[6] -= scalar(@badPairs); #numberOfPairs - return if ($t->[6] < $nb_pairs_threshold); - - $t->[7] = mark_values(\@badPairs, $t->[7]); - $t->[8] = mark_values(\@badPairs, $t->[8]); - $t->[9] = mark_values(\@badPairs, $t->[9]); - $t->[10] = mark_values(\@badPairs, $t->[10]); - $t->[11] = mark_values(\@badPairs, $t->[11]); - - $t->[12] = mark_indexes(\@badPairs, $t->[12]); - $t->[13] = mark_indexes(\@badPairs, $t->[13]); - - $t->[14] = mark_values(\@badPairs, $t->[14]); - $t->[15] = mark_values(\@badPairs, $t->[15]); - $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio - $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); - ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); - ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); - - #recalc breakpoints: - my $quant001 = 3.090232; - my $maxFragmentLength = &floor($quant001 * $sigma + $mu); - $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); - $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); - #recalc total ratio - $t->[22] = $t->[6] / $t->[23] if ($order_filtering); - $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); - - @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); - @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); - - $meanDistance = 0; - - for my $i (0..scalar(@positions1)-1) { - $meanDistance += $positions2[$i]-$positions1[$i]; - } - $meanDistance /= scalar(@positions1); - - } else { - $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); - $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); - - } #nothing has been filtered - return $meanDistance; -} - -sub recalculate_ratio { - my ($left, $ratio) = @_; - my @elements = split (/\//,$ratio); - $elements[1]= $elements[0]; - $elements[0]=$left; - return $ratio."\t".join("/",@elements); -} - -sub recalc_breakpoints { - my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; - my $break_pont_chr; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - - my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); - - unless ($diff_sense_ends) { - $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; - } else { - $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; - } - return $break_pont_chr; -} -sub recalculate_boundaries { - my ($startString,$senseString,$endsOrderString,$tag_length) = @_; - my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); - my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); - my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); - my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); - my $coord_start_cluster = min(min(@positions),min(@ends)); - my $coord_end_cluster = max(max(@positions),max(@ends)); - return ($coord_start_cluster,$coord_end_cluster); -} - -sub remove_indexes { - my ($bads, $string) = @_; - my @elements = deleteBadOrderSensePairs(split (/,/,$string)); - for my $i (reverse sort %{$bads}) { - delete $elements[$i]; - } - return "(".join(",",@elements).")"; -} -##add @ to to elements -sub mark_values { - my ($bads, $string) = @_; - my @elements = getAllEntries($string); - for my $i (@{$bads}) { - $elements[$i] .= "@"; - } - return "(".join(",",@elements).")"; -} -##add @ to to indexes -sub mark_indexes { - my ($bads, $string) = @_; - my @elements = getAllEntries($string); - for my $i ((0..scalar(@elements)-1)) { - for my $j (@{$bads}) { - $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); - } - } - - return "(".join(",",@elements).")"; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub redraw { - - my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; - - my $out; - my @first_arr; - if ($ifBalanced eq 'BAL') { - my @second_arr; - my $lastPushed = 1; - if ($type == 1) { - for my $i (0 .. scalar(@{$arr})-1) { - if (exists ($table->{$i})) { - push(@first_arr,$arr->[$i]); - $lastPushed = 1; - }elsif (exists ($secondTable->{$i})) { - push(@second_arr,$arr->[$i]); - $lastPushed = 2; - } elsif ($lastPushed == 1) { - if (exists ($badInFRSense->{$i})) { - push(@first_arr,$arr->[$i]."\$"); - }else { - push(@first_arr,$arr->[$i]."*"); - } - } elsif ($lastPushed == 2) { - if (exists ($badInFRSense->{$i})) { - push(@second_arr,$arr->[$i]."\$"); - }else { - push(@second_arr,$arr->[$i]."*"); - } - } else {print "Error!";exit;} - } - } else { - for my $i (@{$arr}) { - if (exists ($table->{$i-1})) { - push(@first_arr,$i); - $lastPushed = 1; - }elsif (exists ($secondTable->{$i-1})) { - push(@second_arr,$i); - $lastPushed = 2; - } elsif ($lastPushed == 1) { - if (exists ($badInFRSense->{$i-1})) { - push(@first_arr,$i."\$"); - }else { - push(@first_arr,$i."*"); - } - } elsif ($lastPushed == 2) { - if (exists ($badInFRSense->{$i-1})) { - push(@second_arr,$i."\$"); - }else { - push(@second_arr,$i."*"); - } - } else {print "Error!";exit;} - } - } - $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; - } - else { - if ($type == 1) { - for my $i (0 .. scalar(@{$arr})-1) { - if (exists ($table->{$i})) { - push(@first_arr,$arr->[$i]); - } else { - if (exists ($badInFRSense->{$i})) { - push(@first_arr,$arr->[$i]."\$"); - }else { - push(@first_arr,$arr->[$i]."*"); - } - } - } - } else { - for my $i (@{$arr}) { - if (exists ($table->{$i-1})) { - push(@first_arr,$i); - } else { - if (exists ($badInFRSense->{$i-1})) { - push(@first_arr,$i."\$"); - }else { - push(@first_arr,$i."*"); - } - } - } - } - $out = '('.join(",",@first_arr).')'; - } - return $out; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub check { - - my $table = $_[0]; - my $bad = 'OK'; - my $max = 0; - for my $i (sort {$a<=>$b} keys %{$table}) { - unless ($table->{$i}->{nonAdeq} == 0) { - if ($max<$table->{$i}->{nonAdeq}) { - $max=$table->{$i}->{nonAdeq}; - $bad = $i; - } - } - } - return $bad; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub reversed { - - my ($i,$j,$ifRenv,$positions) = @_; - if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ - return 1; - } - return 0; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub remove { - - my ($bad,$table) = @_; - for my $i (sort {$a<=>$b} keys %{$table}) { - if ($bad == $i) { - delete($table->{$i});; - } else { - if (exists($table->{$i}->{$bad})) { - delete($table->{$i}->{$bad}); - $table->{$i}->{nonAdeq}--; - } - } - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs - - my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - my (@standardArray); - if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs - $leftLetterOk = 'R'; - $rightLetterOk = 'F'; - @standardArray = translateSolidToRF($strand,$ends_order); - } else { - @standardArray = @{$strand}; - } - - my $ifR = 0; - my @Rs; - - for my $i (@keysLeft) { - if ($standardArray[$i] eq $leftLetterOk) { - $ifR++; - push(@Rs,$i); - } - } - - - my $ifF = 0; - my @Fs; - - for my $i (@keysLeft) { - if ($standardArray[$i] eq $rightLetterOk) { - $ifF++; - push(@Fs,$i); - } - } - - if($ifR>=$ifF) { - return @Fs; - } - return @Rs; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa - - my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); - my $mate_sense = $_[6]; - - my $leftLetterOk = substr($mate_sense, 0, 1); #R - my $rightLetterOk = substr($mate_sense, 1, 1); #F - - my (@standardArray1,@standardArray2); - - if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs - $leftLetterOk = 'R'; - $rightLetterOk = 'F'; - @standardArray1 = translateSolidToRF($strand1,$ends_order1); - my @arr = getOrderedStrands($strand2,$order2); - my @ends2 = getOrderedStrands($ends_order2,$order2); - @standardArray2 = translateSolidToRF(\@arr,\@ends2); - - } else { - @standardArray1 = @{$strand1}; - @standardArray2 = getOrderedStrands($strand2,$order2); - } - - #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF - - #for the first end: - - my @array = @standardArray1; - my %badInFRSense1; - for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense1{$i}=1; - $array[$i] = $rightLetterOk; - } - } - my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); - @array = @standardArray1; - my %badInFRSense0; - for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense0{$i-1}=1; - $array[$i-1] = $leftLetterOk; - - } - } - my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); - - #for the second end: - @array = @standardArray2; - - my %badInFRSense3; - for my $i (1..scalar(@array)-1){ - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense3{$order2->[$i]}=1; - $array[$i] = $rightLetterOk; - } - } - my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); - - @array = @standardArray2; - my %badInFRSense5; - for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF - if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { - $badInFRSense5{$i-1}=1; - $array[$i-1] = $leftLetterOk; - } - } - my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); - - if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { - return (1,%badInFRSense0); - } - - if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { - return (1,%badInFRSense1); - } - - if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { - return (2,%badInFRSense3); - } - - if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { - return (2,%badInFRSense5); - } - - #should not get here: - print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; - return (1,%badInFRSense1); -} - -sub getOrderedStrands { - my ($strand,$order) = ($_[0],$_[1]); - my @arr; - for my $i (0..scalar(@{$strand})-1) { - push(@arr,$strand->[$order->[$i]-1]); - } - return @arr; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub checkClusters { - - my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; - if ($ifRenv eq 'REVERSE_SENSE') { - if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { - return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; - } - return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; - } - #if NORM - if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { - return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; - } - return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub translateSolidToRF { - my ($strandArr,$ends_orderArr)=@_; - my @array; - for my $i (0..scalar(@{$strandArr})-1) { - if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { - push(@array,'F'); - } - if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { - push(@array,'R'); - } - if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { - push(@array,'R'); - } - if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { - push(@array,'F'); - } - } - return @array; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the circos format -sub links2segdup{ - - my($id,$color_code,$links_file,$segdup_file)=@_; - - print LOG "# Converting to the circos format...\n"; - - tie (my %hcolor,'Tie::IxHash'); #color-code hash table - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - } - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; - - my $index=1; - while(){ - - my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; - - my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links - - print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output - "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; - $index++; - } - - close LINKS; - close SEGDUP; - print LOG "-- output created: $segdup_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bedPE format for BEDTools usage -sub links2bedPElinksfile{ - - my ($sample,$links_file,$bedpe_file)=@_; - - open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; - open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; - - my $nb_links=1; - - while(){ - - chomp; - my @t=split("\t",$_); - my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); - my $type=($chr1 eq $chr2)? "INTRA":"INTER"; - $type.="_".$t[10]; - - $start1--; $start2--; - - print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". - "\t$sample"."_link$nb_links\t$type\t.\t.". - "\t".join("|",@t)."\n"; - - $nb_links++; - } - - close LINKS; - close BEDPE; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub bedPElinks2linksfile{ - - my ($bedpe_file,$links_file)=@_; - - open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; - open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; - - while(){ - - chomp; - my $sample=(split("_",(split("\t",$_))[6]))[0]; - my @t1=(split("\t",$_))[0,1,2,3,4,5]; - my @t2=split(/\|/,(split("\t",$_))[10]); - push(@t2,$sample); - - print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; - - } - close BEDPE; - close LINKS; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#convert the links file to the bed format -sub links2bedfile{ - - my ($tag_length,$color_code,$links_file,$bed_file)=@_; - - print LOG "# Converting to the bed format...\n"; - - my $compare=1; - if($links_file!~/compared$/){ - $compare=0; - $tag_length->{none}->{1}=$tag_length->{1}; - $tag_length->{none}->{2}=$tag_length->{2}; - } - - #color-code hash table - tie (my %hcolor,'Tie::IxHash'); - my %color_order; - my $n=1; - foreach my $col (keys %{$color_code}){ - my ($min_links,$max_links)=split(",",$color_code->{$col}); - $hcolor{$col}=[$min_links,$max_links]; - $color_order{$col}=$n; - $n++; - } - - my %pair; - my %pt; - $n=1; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - - my %str=( "F"=>"+", "R"=>"-" ); - - while(){ - - my @t=split; - my $sample=($compare)? pop(@t):"none"; - - my $chr1=$t[0]; - my $chr2=$t[3]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $same_chr=($chr1 eq $chr2)? 1:0; - - my $count=$t[6]; - my $color=getColor($count,\%hcolor,"bed"); - - my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); - my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); - my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); - my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); - my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); - my @position1=deleteBadOrderSensePairs(split(",",$t[14])); - my @position2=deleteBadOrderSensePairs(split(",",$t[15])); - my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); - my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); - - - for my $p (0..$#pairs){ - - if (!exists $pair{$pairs[$p]}){ - - if($same_chr){ - - $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, - $start1[$p]-1, $end2[$p], $color, - 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; - $pt{$n}=$pair{$pairs[$p]}->{0}; - $n++; - - }else{ - - $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, - $start1[$p]-1, $end1[$p], $color, - 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{1}; - $n++; - - - $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, - $start2[$p]-1, $end2[$p], $color, - 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; - $pt{$n}=$pair{$pairs[$p]}->{2}; - $n++; - } - }else{ - - if($same_chr){ - ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); - }else{ - ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); - ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); - } - } - } - } - close LINKS; - - my $nb_pairs=$n-1; - - open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; - print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". - "visibility=2 itemRgb=\"On\"\n"; - - for my $i (1..$nb_pairs){ - print BED join("\t",@{$pt{$i}})."\n"; - } - - close BED; - - print LOG "-- output created: $bed_file\n"; - - undef %pair; - undef %pt; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub deleteBadOrderSensePairs{ - - my (@tab)=@_; - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[]//g; - push(@tab2,$v) if($v!~/[\$\*\@]$/); - } - return @tab2; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getAllEntries{ - my (@tab)=split (/,/,$_[0]); - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[]//g; - push(@tab2,$v); - } - return @tab2; -}#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getAllEntriesWOspecialChar{ - my (@tab)=split (/,/,$_[0]); - my @tab2; - - foreach my $v (@tab){ - - $v=~s/[\$\*\@]//g; - push(@tab2,$v); - } - return @tab2; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub links2SVfile{ - - my($links_file,$sv_file)=@_; - - print LOG "# Converting to the sv output table...\n"; - open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist - chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering - final_score breakpoint1_start1-end1 breakpoint2_start2-end2); - - my $nb_links=0; - - while (){ - - my @t=split; - my @sv=(); - my $sv_type="-"; - my $strand_ratio="-"; - my $eq_ratio="-"; - my $eq_type="-"; - my $insert_ratio="-"; - my $link="-"; - my ($bk1, $bk2)=("-","-"); - my $score="-"; - - my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); - my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); - my $nb_pairs=$t[6]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); - $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); - my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; - - #if strand filtering - if (defined $t[16]){ - #if inter-chr link - $sv_type=$t[16]; - if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ - $strand_ratio=floor($1/$2*100)."%"; - $score=$t[18]; - } - if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ - #if intra-chr link with insert size filtering - $strand_ratio=floor($1/$2*100)."%"; - $link=floor($t[17]); - if($sv_type!~/^INV/){ - $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - $score=$t[20]; - }else{ - $score=$t[19]; - } - } - } - - if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ - - #if strand and order filtering only and/or interchr link - $eq_type=$t[18]; - $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); - ($bk1, $bk2)=($t[20],$t[21]); - foreach my $bk ($bk1, $bk2){$bk=~s/\),$/ /g; $bk=~s/(\(|$)//g; $bk=~s/,/-/g;} - $score=$t[22]; - - }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ - - #if all three filtering - $link=floor($t[17]); - $eq_type=$t[19]; - $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); - - if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ - $insert_ratio=floor($1/$2*100)."%"; - ($bk1, $bk2)=($t[22],$t[23]); - $score=$t[24]; - - }else{ - ($bk1, $bk2)=($t[21],$t[22]); - $score=$t[23]; - } - foreach my $bk ($bk1, $bk2){$bk=~s/\),$/ /g; $bk=~s/(\(|$)//g; $bk=~s/,/-/g;} - - } - - - push(@sv, $chr_type, $sv_type,$eq_type); - push(@sv,"$chr1\t$start1-$end1"); - push(@sv, $link); - push(@sv,"$chr2\t$start2-$end2", - $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); - - - print SV join("\t",@sv)."\n"; - } - - close LINKS; - close SV; - - system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; - - open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; - my @links=; - close SV; - - open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; - - print SV join("\t",@header)."\n"; - print SV @links; - close SV; - - unlink($sv_file.".sorted"); - - print LOG "-- output created: $sv_file\n"; - -} -#------------------------------------------------------------------------------# -sub densityCalculation{ - - my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; - - my @sfile=split(/\./,$$mates_file[$file]); - my $fchr=$sfile[$#sfile]; - - my $fh = new FileHandle; - - my %density; - my %density_ref; - my @ratio; - my ($cov,$cov_ref); - - #FREQUENCY CALCULATION PROCEDURE - print LOG "# $fchr : Frequency calculation procedure...\n"; - &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); - &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); - - #MAKING RATIO AND OUTPUT - print LOG "\# Ratio calculation procedure...\n"; - $density_file=~s/\/mates\//\/density\//; - $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; - - foreach my $k (1..$chr->{nb_chrs}){ - foreach my $frag (1..$chr->{$k}->{nb_frag}){ - - @ratio= ($chr->{$k}->{name}, - (${$chr->{$k}->{$frag}}[0]+1), - (${$chr->{$k}->{$frag}}[1]+1)); - - $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; - $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; - - push(@ratio,$cov,$cov_ref); - push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); - push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); - push(@ratio,log($cov+1)) if($cov && !$cov_ref); - next if(!$cov && !$cov_ref); - - print $fh join("\t",@ratio)."\n"; - } - } - - $fh->close; - print LOG "-- output created: $density_file\n"; - - undef %density; - undef %density_ref; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub FreqCalculation{ - - my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; - - my @sfile=split(/\./,$mates_file); - my $fchr=$sfile[$#sfile]; - my $fh = new FileHandle; - - my $nb_windows=0; - my $warn=100000; - my $record=0; - my %pair; - - my ($sumX,$sumX2) = (0,0); - - print LOG "\# Frequency calculation for $mates_file...\n"; - - if ($mates_file =~ /.gz$/) { - $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; - }elsif($mates_file =~ /.bam$/){ - o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY - }else{ - $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; - } - - while(<$fh>){ - - my @t=split; - my $mate=$t[0]; - - my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); - - next if(exists $pair{$mate}); - - next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); - - next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); - ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); - - $pair{$mate}=undef; - $record++; - - my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); - - recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); - recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); - - my $length = abs($coord_start_read1-$coord_start_read2); - $sumX += $length; #add to sum and sum^2 for mean and variance calculation - $sumX2 += $length*$length; - - for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ - - if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ - - &addToDensity($density,$chr_read1,$i,\$nb_windows) - if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); - - }else{ - - $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); - } - } - - if($record>=$warn){ - print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; - $warn+=100000; - } - } - $fh->close; - - print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; - - if($record>0){ - - my $mu = $sumX/$record; - my $sigma = sqrt($sumX2/$record - $mu*$mu); - print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; - } - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub ratio2segdup{ - - my($id,$density_file,$segdup_file)=@_; - - print LOG "# Converting to circos format...\n"; - - open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; - open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; - - while(){ - chomp; - my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; - print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; - } - - close RATIO; - close SEGDUP; - print LOG "-- output created: $segdup_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub ratio2bedfile{ - - my($density_file,$bed_file)=@_; - - print LOG "# Converting to bedGraph format...\n"; - - open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; - open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; - print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". - "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; - - while(){ - chomp; - my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; - $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); - print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; - } - - close RATIO; - close BED; - print LOG "-- output created: $bed_file\n"; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub inverseSense{ - - my $mate_sense=$_[0]; - my %reverse=( 'F' => 'R' , 'R' => 'F' , - 'FF' => 'RR', 'RR' => 'FF', - 'FR' => 'RF', 'RF' => 'FR'); - return $reverse{$mate_sense}; -} - -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getNextFrag{ - - my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; - - my $how_far = $read_start-$frag_start; - my $nb_windows_toskip; - - if($how_far>0){ - - $nb_windows_toskip=($how_far/$step)-($window_dist/$step); - $nb_windows_toskip=~ s/\..*//; - $nb_windows_toskip=0 if($nb_windows_toskip<0); - return ($frag_num + $nb_windows_toskip); - } - return $frag_last; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getColor{ - - my($count,$hcolor,$format)=@_; - for my $col ( keys % { $hcolor} ) { - return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); - } - return "white" if($format eq "circos"); - return "255,255,255" if($format eq "bed"); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub recupCoords{ - - my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; - my $strand = 'F'; - - if ($c_hit=~s/^\-//) { - $strand='R'; - $$cs_hit=$c_hit; - $$ce_hit=$c_hit-($tag_length-1); - }else{ - $$cs_hit=$c_hit; - $$ce_hit=$c_hit+($tag_length-1); - } - return $strand; - -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub overlap { - my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; - if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { - return 0; - } - return 1; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub makeLink { - - my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; - - if($chr1>$chr2){ - ($chr1,$chr2)= ($chr2,$chr1); - ($frag1,$frag2)= ($frag2,$frag1); - } - - if($chr1 == $chr2){ - if($frag1>$frag2){ - ($frag1,$frag2)= ($frag2,$frag1); - } - } - - if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ - $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; - $$nb++; - }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ - $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#fonction of adding the read to the density profile -sub addToDensity { - - my ($density,$chr1,$frag1,$nb)=@_; - - if(!exists $density->{$chr1}->{$frag1}->{count}){ - $density->{$chr1}->{$frag1}->{count}=1; - $$nb++; - }else{ - $density->{$chr1}->{$frag1}->{count}++; - } -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub floor { - my $nb = $_[0]; - $nb=~ s/\..*//; - return $nb; -} -#------------------------------------------------------------------------------# -sub decimal{ - - my $num=shift; - my $digs_to_cut=shift; - - $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); - - return $num; -} - -#------------------------------------------------------------------------------# -sub max { - - my($max) = shift(@_); - foreach my $temp (@_) { - $max = $temp if $temp > $max; - } - return($max); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub min { - - my($min) = shift(@_); - foreach my $temp (@_) { - $min = $temp if $temp < $min; - } - return($min); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub sortTablebyIndex{ - my ($tab1,$tab2)=@_; - my @tab3; - - foreach my $i (@$tab1){ - $tab3[$i]=$$tab2[$$tab1[$i]]; - } - return @tab3; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub round { - my $number = shift || 0; - my $dec = 10 ** (shift || 0); - return int( $dec * $number + .5 * ($number <=> 0)) / $dec; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub getUniqueTable{ - - my (@tab)=@_; - my (%saw,@out)=(); - undef %saw; - return sort(grep(!$saw{$_}++, @tab)); -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -sub catFiles { - - unlink("$_[1]") if(exists $_[1]); - system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; -} -#------------------------------------------------------------------------------# -#------------------------------------------------------------------------------# -#check if the configuration file is correct -sub validateconfiguration{ - - my %conf=%{$_[0]}; - my $list_prgs="@ARGV"; - - my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); - my @detection_params=qw(split_mate_file window_size step_length split_mate_file); - my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); - my @circos_params=qw(organism_id colorcode); - my @bed_params=qw(colorcode); - my @compare_params=qw(list_samples file_suffix); - - foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ - - unless (defined($dir)) { - $dir = "."; - } - unless (-d $dir){ - mkdir $dir or die; - } - $dir.="/" if($dir!~/\/$/); - } - - unless (defined($conf{general}{num_threads})) { - $conf{general}{num_threads} = 1; - } - $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); - - if($list_prgs!~/links2compare/){ - - foreach my $p (@general_params){ - die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); - } - - $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); - - unless (defined($conf{general}{sv_type})) { - $conf{general}{sv_type} = "all"; - } - - $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; - } - - if($list_prgs=~/(linking|cnv)/){ - - foreach my $p (@detection_params){ - die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); - } - - die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); - - if($conf{detection}{step_length}>$conf{detection}{window_size}){ - die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); - } - - unless (-d $conf{general}{tmp_dir}."/mates"){ - mkdir $conf{general}{tmp_dir}."/mates" or die; - } - - if($list_prgs=~/linking/){ - unless (-d $conf{general}{tmp_dir}."/links"){ - mkdir $conf{general}{tmp_dir}."/links" or die; - } - } - if($list_prgs=~/cnv/){ - unless (-d $conf{general}{tmp_dir}."/density"){ - mkdir $conf{general}{tmp_dir}."/density" or die; - } - } - - } - - if($list_prgs=~/filtering/){ - - foreach my $p (@filtering_params) { - die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); - - } - - if(defined($conf{filtering}{chromosomes})) { - my @chrs=split(",",$conf{filtering}{chromosomes}); - my $exclude=($chrs[0]=~/^\-/)? 1:0; - for my $chrName (@chrs){ - - die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") - if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); - - } - } - - if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { - die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". - "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); - } - if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { - die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); - } - if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { - die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". - "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); - } - if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { - die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); - } - - if (!defined($conf{filtering}{indel_sigma_threshold})) { - $conf{filtering}{indel_sigma_threshold} = 2; - } - if (!defined($conf{filtering}{dup_sigma_threshold})) { - $conf{filtering}{dup_sigma_threshold} = 2; - } - if (!defined($conf{filtering}{singleton_sigma_threshold})) { - $conf{filtering}{singleton_sigma_threshold} = 4; - } - - if (!defined($conf{filtering}{nb_pairs_order_threshold})) { - $conf{filtering}{nb_pairs_order_threshold} = 1; - } - - if (!defined($conf{filtering}{final_score_threshold})) { - $conf{filtering}{final_score_threshold} = 0.8; - } - - if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { - die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); - } - - } - - if($list_prgs=~/2circos$/){ - foreach my $p (@circos_params) { - next if($list_prgs=~/^ratio/ && $p eq "colorcode"); - die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); - } - } - - if($list_prgs=~/2bed$/){ - foreach my $p (@bed_params) { - die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); - } - } - - if($list_prgs=~/links2compare/){ - foreach my $p (@compare_params) { - die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); - } - - unless (defined($conf{compare}{same_sv_type})) { - $conf{compare}{same_sv_type} = 0; - } - - unless (defined($conf{compare}{min_overlap})) { - $conf{compare}{min_overlap} = 1E-9; - } - - if($conf{compare}{circos_output}){ - foreach my $p (@circos_params) { - next if($list_prgs=~/^ratio/ && $p eq "colorcode"); - die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); - } - } - if($conf{compare}{bed_output}){ - foreach my $p (@bed_params) { - die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); - } - die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); - - my @samples=split(",",$conf{compare}{list_samples}); - my @read_lengths=split(",",$conf{compare}{list_read_lengths}); - for my $i (0..$#samples){ - my @l=split("-",$read_lengths[$i]); - $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; - } - } - } - - -} -#------------------------------------------------------------------------------# -#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/SVDetect_run_parallel.xml --- a/svdetect/SVDetect_run_parallel.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,324 +0,0 @@ - - -and identify structural variants - -SVDetect_run_parallel.pl - -#if $getLinks.linking == "linking" -linking - -#end if -#if $getFilteredLinks.filtering == "filtering" -filtering - -#if str($getFilteredLinks.links2SV) == "create" -links2SV --out3 '$sv_file' -#end if -#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2circos) == "create" -links2circos --out4 '$circos_file' -#end if -#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2bed) == "create" -links2bed --out5 '$bed_file' -#end if -#end if --conf '$config_file' --l '$log_file' --N '$sample_name' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( - getFilteredLinks['filtering']=="filtering" and - getFilteredLinks['links2SV'] is True - ) - - - - ( - getFilteredLinks['filtering']=="filtering" and - getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and - getFilteredLinks['file_conversion']['links2circos'] is True - ) - - - - ( - getFilteredLinks['filtering']=="filtering" and - getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and - getFilteredLinks['file_conversion']['links2bed'] is True - ) - - - - - - - - - -<general> -input_format = bam -sv_type = ${sv_type} -mates_orientation=${mates_orientation} -read1_length=${read1_length} -read2_length=${read2_length} -mates_file=${mates_file} -cmap_file=${cmap_file} -tmp_dir=$__new_file_path__/svdetect/tmp -output_dir=$__new_file_path__/svdetect -num_threads=8 -</general> - -#if $getLinks.linking == "linking" -<detection> -#if str($getLinks.splitmate) == "split" -split_mate_file=1 -#else -split_mate_file=0 -#end if -window_size=${getLinks.window_size} -step_length=${getLinks.step_length} -</detection> -#end if - -#if $getFilteredLinks.filtering == "filtering" -<filtering> -#if str($getFilteredLinks.splitlink) == "split" -split_link_file=1 -#else -split_link_file=0 -#end if -#if str($getFilteredLinks.chromosomes) != "" -chromosomes=${getFilteredLinks.chromosomes} -#end if -nb_pairs_threshold=${getFilteredLinks.nb_pairs_threshold} -#if $getFilteredLinks.filter1.strand_filtering == "strand" -strand_filtering=1 -final_score_threshold=${getFilteredLinks.filter1.final_score_threshold} -#if $getFilteredLinks.filter1.filter2.order_filtering == "order" -order_filtering=1 -mu_length=${getFilteredLinks.filter1.filter2.mu_length} -sigma_length=${getFilteredLinks.filter1.filter2.sigma_length} -nb_pairs_order_threshold=${getFilteredLinks.filter1.filter2.nb_pairs_order_threshold} -#if $getFilteredLinks.filter1.filter2.filter3.insert_size_filtering == "insert" -insert_size_filtering=1 -indel_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.indel_sigma_threshold} -dup_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.dup_sigma_threshold} -singleton_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.singleton_sigma_threshold} -#else -insert_size_filtering=0 -#end if -#else -order_filtering=0 -#end if -#else -strand_filtering=0 -#end if -</filtering> -#end if - -#if $getFilteredLinks.filtering == "filtering" -#if $getFilteredLinks.file_conversion.file_conversion_select == "convert" -#if str($getFilteredLinks.file_conversion.links2circos) == "create" -<circos> -organism_id=${getFilteredLinks.file_conversion.organism_id} -<colorcode> -#for $color_repeat in $getFilteredLinks.file_conversion.color_code -${color_repeat.color}=${color_repeat.interval} -#end for -</colorcode> -</circos> -#end if -#if str($getFilteredLinks.file_conversion.links2bed) == "create" -<bed> -<colorcode> -#for $color_repeat in $getFilteredLinks.file_conversion.color_code -#if str($color_repeat.color)== "grey" -190,190,190=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "black" -0,0,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "blue" -0,0,255=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "green" -0,255,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "purple" -153,50,205=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "orange" -255,140,0=${color_repeat.interval} -#end if -#if str($color_repeat.color)== "red" -255,0,0=${color_repeat.interval} -#end if -#end for -</colorcode> -</bed> -#end if -#end if -#end if - - - - -**What it does** - -SVDetect - Version : 0.8b - -Parallel version (nCPU=8) - -SVDetect is a application for the isolation and the type prediction of intra- and inter-chromosomal rearrangements from paired-end/mate-pair sequencing data provided by the high-throughput sequencing technologies - -This tool aims to identifying structural variations (SVs) with both clustering and sliding-window strategies, and helping in their visualization at the genome scale. -SVDetect is compatible with SOLiD and Illumina (>=1.3) reads. - -Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 svdetect/circos_graph.xml --- a/svdetect/circos_graph.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,290 +0,0 @@ - - -plots - -circos/bin/circos - --conf '$circos_config_file' --outputfile '${outputfile}.dat' --png - -> '$log_file' - -; - -rm '$outputfile'; ln -s '${outputfile}.png' '$outputfile' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -<ideogram> - -<spacing> - -default = 5u -break = 1u - -axis_break_at_edge = yes -axis_break = yes -axis_break_style = 2 - -<break_style 1> -stroke_color = black -fill_color = blue -thickness = 0.25r -stroke_thickness = 2 -</break> - -<break_style 2> -stroke_color = black -stroke_thickness = 3p -thickness = 1.5r -</break> - -</spacing> - -## thickness (px) of chromosome ideogram -thickness = 100p -stroke_thickness = 2 -## ideogram border color -stroke_color = black -fill = yes -## the default chromosome color is set here and any value -## defined in the karyotype file overrides it -fill_color = black - -## fractional radius position of chromosome ideogram within image -radius = 0.85r -show_label = yes -label_with_tag = yes -label_font = condensedbold -label_radius = dims(ideogram,radius) + 0.075r -label_size = 60p - -## cytogenetic bands -band_stroke_thickness = 2 - -## show_bands determines whether the outline of cytogenetic bands -## will be seen -show_bands = yes -## in order to fill the bands with the color defined in the karyotype -## file you must set fill_bands -fill_bands = yes - -</ideogram> - - - - - -show_ticks = yes -show_tick_labels = yes - -<ticks> -radius = dims(ideogram,radius_outer) -multiplier = 1e-6 - -<tick> -spacing = 0.5u -size = 2p -thickness = 2p -color = grey -show_label = no -label_size = 12p -label_offset = 0p -format = %.2f -</tick> - -<tick> -spacing = 1u -size = 3p -thickness = 2p -color = dgrey -show_label = no -label_size = 12p -label_offset = 0p -format = %.2f -</tick> - -<tick> -spacing = 5u -size = 5p -thickness = 2p -color = black -show_label = yes -label_size = 16p -label_offset = 0p -format = %d -</tick> - -<tick> -spacing = 10u -size = 8p -thickness = 2p -color = black -show_label = yes -label_size = 20p -label_offset = 5p -format = %d -</tick> -</ticks> - - - - -<colors> -<<include etc/colors.conf>> -</colors> - -<fonts> -<<include etc/fonts.conf>> -</fonts> - -<<include $ideogram_config_file>> -<<include $ticks_config_file>> - -karyotype = $karyotype - -<image> -24bit = yes -##png = yes -##svg = no -## radius of inscribed circle in image -radius = 1500p -background = white -## by default angle=0 is at 3 o'clock position -angle_offset = -90 -#angle_orientation = counterclockwise - -auto_alpha_colors = yes -auto_alpha_steps = 5 -</image> - -chromosomes_units= $chromosomes_units - -#if str($chromosomes)=="" -chromosomes_display_default = yes -#else -chromosomes_display_default = no -chromosomes = $chromosomes -#end if - -<links> - -z = 0 -radius = 0.95r -bezier_radius = 0.2r - -<link segdup> -show = yes -color = dgrey_a5 -thickness = 2 -file = $link_file -record_limit = 1000 -</link> - -</links> - - -anglestep = 0.5 -minslicestep = 10 -beziersamples = 40 -debug = no -warnings = no -imagemap = no - -units_ok = bupr -units_nounit = n - - - - -**What it does** - -Circos - -Manual documentation available at the http://circos.ca/ - - -**Example of link segdup file** - -segdup file:: - - 1 hs1 1077096 1078746 color=red - 1 hs1 1080923 1082805 color=red - 2 hs1 1137684 1137961 color=red - 2 hs3 1138138 1138423 color=red - 3 hs11 1169417 1170000 color=red - 3 hs11 1170025 1170975 color=red - 4 hs11 1222480 1224271 color=green - 4 hs11 1223328 1225675 color=green - 5 hs12 1223336 1225812 color=grey - 5 hs13 1224709 1227633 color=grey - 6 hs11 1223621 1226460 color=red - 6 hs11 1224918 1227633 color=red - 7 hs11 1399510 1401513 color=white - 7 hs11 1401628 1403697 color=white - 8 hs15 1652045 1653746 color=red - 8 hs15 1657167 1658940 color=red - 9 hs11 165333 165887 color=white - 9 hs11 165981 168016 color=white - 10 hs11 1702700 1702841 color=red - 10 hs11 1702903 1703057 color=red - 11 hs11 1912272 1915186 color=white - 11 hs11 1937111 1939824 color=white - 12 hs11 1983211 1983355 color=red - 12 hs11 1983591 1983748 color=red - 13 hs11 2913657 2913898 color=white - 13 hs11 2914048 2914341 color=white - 14 hs11 3090593 3090749 color=purple - 14 hs11 3090709 3090864 color=purple - 15 hs21 3466365 3466434 color=red - 15 hs21 3466554 3466620 color=red - 16 hsX 3603073 3603321 color=white - 16 hsX 3603295 3603520 color=white - - - ------ - -.. class:: infomark - -Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of Circos. - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 varscan/tool_dependencies.xml --- a/varscan/tool_dependencies.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - - - - - - http://downloads.sourceforge.net/project/varscan/VarScan.v2.3.5.jar - - VarScan.v2.3.5.jar - $INSTALL_DIR/jars - - - $INSTALL_DIR/jars - - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 varscan/varscan_mpileup.pl --- a/varscan/varscan_mpileup.pl Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Cwd; - -die qq( -Bad numbr of inputs - -) if(!@ARGV); - -my $options =""; -my $file=""; -my $command=""; -my $output=""; -my $working_dir = cwd(); -my $temp_vcf = "$working_dir/temp"; -my $log=""; - -foreach my $input (@ARGV) -{ - my @tmp = split "::", $input; - if($tmp[0] eq "COMMAND") - { - $command = $tmp[1]; - } - elsif($tmp[0] eq "INPUT") - { - $file = $tmp[1]; - } - elsif($tmp[0] eq "OPTION") - { - $options = "$options ${tmp[1]}"; - } - elsif($tmp[0] eq "OUTPUT") - { - $output = $tmp[1]; - } - elsif($tmp[0] eq "LOG") - { - $log = $tmp[1]; - } - else - { - die("Unknown Input: $input\n"); - } -} - -system ("$command $file $options 1>$temp_vcf 2>$log"); - -vs2vcf($temp_vcf, $output); - - -sub vs2vcf -{ - - # - # G l o b a l v a r i a b l e s - # - my $version = '0.1'; - - # - # Read in file - # - my $input = shift; - my $output = shift; - my $chr_ord = shift; - open(IN, $input) or die "Can't open $input': $!\n"; - open(OUT, ">$output") or die "Can't create $output': $!\n"; - my %output; - - while ( ) - { - if ( /^#/ ) - { - print OUT; - next; - } - chomp; - my $line = $_; - - my @flds = split ( "\t", $line ); - my $ref = $flds[3]; - my $alt = $flds[4]; - # - # Deletion of bases - # - if ( $alt =~ /^\-/ ) - { - ($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref); - } - - # - # Insertion of bases - # - if ( $alt =~ /^\+/ ) - { - $flds[4] = $ref.substr($alt,1); - } - print OUT join( "\t", @flds),"\n" unless defined $chr_ord; - $output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord; - } - close(IN); - # if chromosome order given return in sorted order - if(defined $chr_ord) - { - for my $chrom (@{ $chr_ord }) - { - for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} }) - { - print OUT $output{$chrom}{$pos}; - } - } - } - close(OUT); -} - diff -r e1ad03534fb2 -r 24d2dc34dd17 varscan/varscan_mpileup.xml --- a/varscan/varscan_mpileup.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,124 +0,0 @@ - - - mutation caller for targeted, exome, and whole-genome resequencing - - - VarScan - - - - varscan_mpileup.pl - "COMMAND::java -jar \$JAVA_JAR_PATH/VarScan.v2.3.5.jar $exe_command" - "INPUT::$in_file" - "OUTPUT::$output" - "LOG::$log" - "OPTION::--min-coverage $min_coverage" - "OPTION::--min-reads2 $min_reads2" - "OPTION::--min-avg-qual $min_avg_qual" - "OPTION::--min-var-freq $min_var_freq" - "OPTION::--min-freq-for-hom $min_freq_for_hom" - "OPTION::--p-value $p_value" - "OPTION::--strand-filter $strand_filter" - "OPTION::--output-vcf 1" - - #if ($vcf_sample_list): - "OPTION::--vcf-sample-list $vcf_sample_list" - #end if - "OPTION::--variants $variants" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -:: - - VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. It can be used to detect different types of variation: - - Germline variants (SNPs an dindels) in individual samples or pools of samples. - Multi-sample variants (shared or private) in multi-sample datasets (with mpileup). - Somatic mutations, LOH events, and germline variants in tumor-normal pairs. - Somatic copy number alterations (CNAs) in tumor-normal exome data. - - -**Input** - -:: - - mpileup file - The SAMtools mpileup file - - -**Parameters** - -:: - - commands - mpileup2snp Identify SNPs from an mpileup file - mpileup2indel Identify indels an mpileup file - mpileup2cns Call consensus and variants from an mpileup file - - min-coverage - Minimum read depth at a position to make a call [8] - - min-reads2 - Minimum supporting reads at a position to call variants [2] - - min-avg-qual - Minimum base quality at a position to count a read [15] - - min-var-freq - Minimum variant allele frequency threshold [0.01] - - min-freq-for-hom - Minimum frequency to call homozygote [0.75] - - p-value - Default p-value threshold for calling variants [99e-02] - - strand-filter - Ignore variants with >90% support on one strand [1] - - output-vcf - If set to 1, outputs in VCF format - - vcf-sample-list - For VCF output, a list of sample names in order, one per line - - variants - Report only variant (SNP/indel) positions [0] - - - - - - diff -r e1ad03534fb2 -r 24d2dc34dd17 varscan/varscan_somatic.pl --- a/varscan/varscan_somatic.pl Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,138 +0,0 @@ -#!/usr/bin/perl - - -use strict; -use Cwd; - -die qq( -Bad numbr of inputs - -) if(!@ARGV); - -my $options =""; -my $normal=""; -my $command=""; -my $tumor=""; -my $output=""; -my $working_dir = cwd(); -my $snp = "$working_dir/output.snp.vcf"; -my $indels = "$working_dir/output.indel.vcf"; - -foreach my $input (@ARGV) -{ - my @tmp = split "::", $input; - if($tmp[0] eq "COMMAND") - { - $command = $tmp[1]; - } - if($tmp[0] eq "NORMAL") - { - $normal = $tmp[1]; - } - elsif($tmp[0] eq "TUMOR") - { - $tumor = $tmp[1]; - } - elsif($tmp[0] eq "OPTION") - { - $options = "$options ${tmp[1]}"; - } - elsif($tmp[0] eq "OUTPUT") - { - $output = $tmp[1]; - } - - else - { - die("Unknown Input: $input\n"); - } -} - -system ("$command $normal $tumor $options "); -system("grep -v '^\#' $indels | grep -v '^chrom position' >> $snp"); - -my @chr_ord = chromosome_order($tumor); - -vs2vcf($snp, $output,\@chr_ord); - - -sub vs2vcf -{ - - # - # G l o b a l v a r i a b l e s - # - my $version = '0.1'; - - # - # Read in file - # - my $input = shift; - my $output = shift; - my $chr_ord = shift; - open(IN, $input) or die "Can't open $input': $!\n"; - open(OUT, ">$output") or die "Can't create $output': $!\n"; - my %output; - - while ( ) - { - if ( /^#/ ) - { - print OUT; - next; - } - chomp; - my $line = $_; - - my @flds = split ( "\t", $line ); - my $ref = $flds[3]; - my $alt = $flds[4]; - # - # Deletion of bases - # - if ( $alt =~ /^\-/ ) - { - ($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref); - } - - # - # Insertion of bases - # - if ( $alt =~ /^\+/ ) - { - $flds[4] = $ref.substr($alt,1); - } - print OUT join( "\t", @flds),"\n" unless defined $chr_ord; - $output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord; - } - close(IN); - # if chromosome order given return in sorted order - if(defined $chr_ord) - { - for my $chrom (@{ $chr_ord }) - { - for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} }) - { - print OUT $output{$chrom}{$pos}; - } - } - } - close(OUT); -} - - -sub chromosome_order -{ - my $input = shift; - # calculate flagstats - my $COMM = "samtools view -H $input | grep '^\@SQ'"; - my @SQ = `$COMM`; - chomp @SQ; - for(my $i = 0; $i <= $#SQ; $i++) - { - $SQ[$i] =~ s/^\@SQ\tSN:(.*?)\tLN:\d+$/$1/; - } - return(@SQ); -} - - diff -r e1ad03534fb2 -r 24d2dc34dd17 varscan/varscan_somatic.xml --- a/varscan/varscan_somatic.xml Fri Sep 13 02:11:51 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ - - - somatic mutation caller for cancer genomics - - - VarScan - - - varscan_somatic.pl - "COMMAND::java -jar \$JAVA_JAR_PATH/VarScan.v2.3.5.jar somatic" - "NORMAL::$normal" - "TUMOR::$tumor" - "OUTPUT::$output" - - "OPTION::--min-coverage $min_coverage" - "OPTION::--min-coverage-normal $min_coverage_normal" - "OPTION::--min-coverage-tumor $min_coverage_tumor" - - "OPTION::--min-var-freq $min_var_freq" - "OPTION::--min-freq-for-hom $min_freq_for_hom" - - "OPTION::--normal-purity $normal_purity" - "OPTION::--tumor-purity $tumor_purity" - - "OPTION::--p-value $p_value" - "OPTION::--somatic-p-value $somatic_p_value" - - "OPTION::--strand-filter $strand_filter" - "OPTION::--validation $validation" - "OPTION::--output-vcf 1" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -:: - - VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. It can be used to detect different types of variation: - - Germline variants (SNPs an dindels) in individual samples or pools of samples. - Multi-sample variants (shared or private) in multi-sample datasets (with mpileup). - Somatic mutations, LOH events, and germline variants in tumor-normal pairs. - Somatic copy number alterations (CNAs) in tumor-normal exome data. - - -**Input** - -:: - - mpileup normal file - The SAMtools mpileup file for normal - mpileup tumor file - The SAMtools mpileup file for tumor - - -**Parameters** - -:: - - min-coverage - Minimum read depth at a position to make a call [8] - - min-coverage-normal - Minimum coverage in normal to call somatic [8] - - min-coverage-tumor - Minimum coverage in tumor to call somatic [6] - - min-var-freq - Minimum variant frequency to call a heterozygote [0.10] - - min-freq-for-hom - Minimum frequency to call homozygote [0.75] - - normal-purity - Estimated purity (non-tumor content) of normal sample [1.00] - - tumor-purity - Estimated purity (tumor content) of tumor sample [1.00] - - p-value - Default p-value threshold for calling variants [0.99] - - somatic-p-value - P-value threshold to call a somatic site [0.05] - - strand-filter - If set to 1, removes variants with >90% strand bias - - validation - If set to 1, outputs all compared positions even if non-variant - - output-vcf - If set to 1, outputs in VCF format [Default] - - - - - -