Mercurial > repos > scisjnu123 > ngsap_vc
changeset 3:0d10255b5434 draft default tip
Uploaded
line wrap: on
 line diff
--- a/GATK/gatk/analyze_covariates.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -<macros> - <xml name="AnalyzeCovariatesParameters" tokens="tag"> - - <!-- BQSR in main config --> - - <param name="afterReportFile" type="data" format="tabular" optional="true" label="file containing the BQSR second-pass report file" help="-after,‑‑afterReportFile &lt;afterReportFile&gt;" /> - - <param name="beforeReportFile" type="data" format="tabular" optional="true" label="file containing the BQSR first-pass report file" help="-before,‑‑beforeReportFile &lt;beforeReportFile&gt;" /> - - </xml> - - <xml name="AnalyzeCovariatesOutput"> - <data format="pdf" name="ac_plotsReportFile" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (PDF Recalibration Report)"> - <yield /> - </data> - </xml> - - <template name="AnalyzeCovariatesPreprocessing"> -<![CDATA[ -]]> - </template> - - <template name="AnalyzeCovariatesOptions"> -<![CDATA[ - --plotsReportFile ${ac_plotsReportFile} - - #if str($analysis_type.afterReportFile) - --afterReportFile $analysis_type.afterReportFile - #end if - #if str($analysis_type.beforeReportFile) - --beforeReportFile $analysis_type.beforeReportFile - #end if -]]> - </template> -</macros> - -
--- a/GATK/gatk/base_recalibrator.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -<macros> - <xml name="BaseRecalibratorParameters" tokens="tag"> - - <expand macro="macro_bam_input" tag="@TAG@" /> - - <param name="knownSites" type="data" format="vcf,bcf,bed,pileup,tabular,table" label="Database of known Sites (ROD files; e.g. VCF format)" multiple="true" title="A database of known polymorphic sites to skip over in the recalibration algorithm" help="-knownSites,‑‑knownSites &lt;knownSites&gt;" /> - - </xml> - - <xml name="BaseRecalibratorOutput"> - <data format="tabular" name="br_table" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (Table)"> - <yield /> - </data> - </xml> - - <template name="BaseRecalibratorPreprocessing"> -<![CDATA[ - @token_bam_input_pre@ - #for $i, $variant in enumerate($analysis_type.knownSites): - ln -s -f ${variant} variant_${i}.vcf && - #end for -]]> - </template> - - <template name="BaseRecalibratorOptions"> -<![CDATA[ - --out ${br_table} - - @token_bam_input@ - - #for $i, $variant in enumerate($analysis_type.knownSites): - --knownSites variant_${i}.vcf - #end for -]]> - </template> -</macros> - -
--- a/GATK/gatk/combine_gvcfs.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -<macros> - <xml name="CombineGVCFsParameters" tokens="tag"> - - <expand macro="macro_gvcf_input" tag="@TAG@" /> - - <expand macro="macro_optional_parameters"> - - <param name="breakBandsAtMultiplesOf" type="integer" value="0" label="If > 0, reference bands will be broken up at genomic positions that are multiples of this number" help="-breakBandsAtMultiplesOf,‑‑breakBandsAtMultiplesOf &lt;breakBandsAtMultiplesOf&gt;" /> - - </expand> - - </xml> - - <xml name="CombineGVCFsOutput"> - <data format="vcf" name="cg_output_vcf" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (VCF)"> - <yield /> - </data> - </xml> - - <template name="CombineGVCFsPreprocessing"> -<![CDATA[ - @token_gvcf_input_pre@ -]]> - </template> - - <template name="CombineGVCFsOptions"> -<![CDATA[ - --out ${cg_output_vcf} - - @token_gvcf_input@ - - #set $optionals = $analysis_type.optional_parameters - #if $optionals.optional_parameters_enabled - #if $optionals.breakBandsAtMultiplesOf > 0 - --breakBandsAtMultiplesOf $optionals.breakBandsAtMultiplesOf - #end if - #end if -]]> - </template> - - -</macros> - -
--- a/GATK/gatk/combine_variants.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -<macros> - <xml name="CombineVariantsParameters" tokens="tag"> - - <expand macro="macro_vcf_input" tag="@TAG@" /> - - <expand macro="macro_optional_parameters"> - - <param name="filteredRecordsMergeType" type="select" label="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields" help="-filteredRecordsMergeType,‑‑filteredrecordsmergetype &lt;filteredrecordsmergetype&gt;"> - <option value="">No Selection</option> - <option value="KEEP_IF_ANY_UNFILTERED">KEEP_IF_ANY_UNFILTERED</option> - <option value="KEEP_IF_ALL_UNFILTERED">KEEP_IF_ALL_UNFILTERED</option> - <option value="KEEP_UNCONDITIONAL">KEEP_UNCONDITIONAL</option> - </param> - - <param name="genotypeMergeOptions" type="select" label="Determines how we should merge genotype records for samples shared across the ROD files" help="-genotypeMergeOptions,‑‑genotypemergeoption &lt;genotypemergeoption&gt;"> - <option value="">No Selection</option> - <option value="UNIQUIFY">UNIQUIFY</option> - <option value="PRIORITIZE">PRIORITIZE</option> - <option value="UNSORTED">UNSORTED</option> - <option value="REQUIRE_UNIQUE">REQUIRE_UNIQUE</option> - </param> - - <param name="minimumN" type="integer" value="1" optional="true" label="Combine variants and output site only if the variant is present in at least N input files" help="-minN,‑‑minimumN &lt;minimumN&gt;" /> - - <param name="rod_priority_list" type="text" value="" optional="true" label="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted" help="-priority,‑‑rod_priority_list &lt;rod_priority_list&gt;" /> - - <param name="setKey" type="text" value="" optional="true" label="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from" help="-setKey,‑‑setKey &lt;setKey&gt;" /> - - <param name="assumeIdenticalSamples" type="boolean" truevalue="--assumeIdenticalSamples" falsevalue="" label="If true, assume input VCFs have identical sample sets and disjoint calls" help="-assumeIdenticalSamples,‑‑assumeIdenticalSamples" /> - - <param name="excludeNonVariants" type="boolean" truevalue="--excludeNonVariants" falsevalue="" label="Don't include loci found to be non-variant after the combining procedure" help="-env,‑‑excludeNonVariants" /> - - <param name="filteredAreUncalled" type="boolean" truevalue="--filteredAreUncalled" falsevalue="" label="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF" help="-filteredAreUncalled,‑‑filteredAreUncalled" /> - - <param name="mergeInfoWithMaxAC" type="boolean" truevalue="--mergeInfoWithMaxAC" falsevalue="" label="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records." help="-mergeInfoWithMaxAC,‑‑mergeInfoWithMaxAC" /> - - <param name="minimalVCF" type="boolean" truevalue="--minimalVCF" falsevalue="" label="If true, then the output VCF will contain no INFO or genotype FORMAT fields" help="-minimalVCF,‑‑minimalVCF" /> - - <param name="printComplexMerges" type="boolean" truevalue="--printComplexMerges" falsevalue="" label="Print out interesting sites requiring complex compatibility merging" help="-printComplexMerges,‑‑printComplexMerges" /> - - <param name="suppressCommandLineHeader" type="boolean" truevalue="--suppressCommandLineHeader" falsevalue="" label="If true, do not output the header containing the command line used" help="-suppressCommandLineHeader,‑‑suppressCommandLineHeader" /> - - </expand> - - </xml> - - <xml name="CombineVariantsOutput"> - <data format="vcf" name="cv_output_vcf" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (VCF)"> - <yield /> - </data> - </xml> - - <template name="CombineVariantsPreprocessing"> -<![CDATA[ - @token_vcf_input_pre@ -]]> - </template> - - <template name="CombineVariantsOptions"> -<![CDATA[ - --out ${cv_output_vcf} - - @token_vcf_input@ - - #set $optionals = $analysis_type.optional_parameters - #if $optionals.optional_parameters_enabled - - #if $optionals.filteredRecordsMergeType - --filteredRecordsMergeType $optionals.filteredRecordsMergeType - #end if - #if $optionals.genotypeMergeOptions - --genotypeMergeOptions $optionals.genotypeMergeOptions - #end if - #if $optionals.minimumN != 1 - --minimumN $optionals.minimumN - #end if - #if $optionals.rod_priority_list - --rod_priority_list $optionals.rod_priority_list - #end if - - $optionals.assumeIdenticalSamples - $optionals.excludeNonVariants - $optionals.filteredAreUncalled - $optionals.mergeInfoWithMaxAC - $optionals.minimalVCF - $optionals.printComplexMerges - $optionals.suppressCommandLineHeader - - #end if -]]> - </template> - - -</macros> - -
--- a/GATK/gatk/gatk.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,179 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<tool id="gatk" name="GATK" version="@VERSION@.d9"> - <description>tool collection Version @VERSION@</description> - <macros> - <import>gatk_macros.xml</import> - <import>realigner_target_creator.xml</import> - <import>indel_realigner.xml</import> - <import>base_recalibrator.xml</import> - <import>analyze_covariates.xml</import> - <import>print_reads.xml</import> - <import>haplotype_caller.xml</import> - <import>genotype_gvcfs.xml</import> - <import>combine_gvcfs.xml</import> - <import>combine_variants.xml</import> - </macros> - <expand macro="requirements"/> - <stdio> - <regex match="^INFO" level="log"/> - <regex match="^WARN" level="warning"/> - <regex match="Using .* implementation of PairHMM" level="warning"/> - <regex match="There is insufficient memory for the Java Runtime Environment to continue" level="fatal"/> - <regex match="^##### ERROR" level="fatal"/> - <exit_code range="1:" level="fatal"/> - </stdio> - <command><![CDATA[ - ############################ - ## import analysis specific preprocessings by using cheetahs internal searchList - ## if not defined, ignore - ############################ - #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2] - #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"] - #include source=$analysisPreprocessing - #end if - - ############################ - ## GATK tool unspecific options - ############################ - @GATK_EXEC@ - - --analysis_type ${analysis_type.analysis_type_selector} - --reference_sequence ${ref_file.fields.path} - - --log_to_file ${output_log} - - #if $cond_intervals.cond_intervals_enabled - #for $interval in $cond_intervals.intervals: - --intervals ${interval.L} - #end for - #end if - - #if $cond_BQSR.cond_BQSR_enabled - --BQSR $cond_BQSR.BQSR - #end if - - ############################ - ## import analysis specific options by using cheetahs internal searchList - ## if not defined throw raw python error until better idea - ############################ - #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2] - #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] - #include source=$analysisOptions - #else - #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] - #end if - - ############################ - ## only put ERROR or FATAL log messages into stderr - ## but keep full log for printing into log file - ############################ - 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2 -]]></command> - <inputs> - <param name="ref_file" type="select" label="Using reference genome" help="-R,‑‑reference_sequence &lt;reference_sequence&gt;"> - <options from_data_table="picard_indexes"/> - <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> - </param> - <conditional name="cond_intervals"> - <param name="cond_intervals_enabled" type="boolean" label="Select interval subset to operate on?"/> - <when value="true"> - <repeat name="intervals" title="genomic interval over which to operate" help="-L,‑‑intervals &lt;intervals&gt;"> - <param name="L" type="text" value=""/> - </repeat> - </when> - <when value="false"/> - </conditional> - <conditional name="cond_BQSR"> - <param name="cond_BQSR_enabled" type="boolean" label="Select covariates for on-the-fly recalibration?"/> - <when value="true"> - <param name="BQSR" type="data" format="tabular" label="Input covariates table file for on-the-fly base quality score recalibration" help="-BQSR,‑‑BQSR &lt;BQSR&gt; intended primarily for use with BaseRecalibrator and PrintReads"/> - </when> - <when value="false"/> - </conditional> - <conditional name="cond_threads"> - <param name="cond_threads_enabled" type="boolean" label="Set computational options (cpu, mem)?"/> - <when value="true"> - <param name="nt" type="integer" value="1" label="Number of data threads to allocate to this analysis" help="make sure, the option is available for the chosen tool"/> - <param name="nct" type="integer" value="1" label="Number of CPU threads to allocate per data thread" help="make sure, the option is available for the chosen tool"/> - <param name="mem" type="integer" value="0" label="Overwrite Memory in MB (0 = don't overwrite)" help="Overwrites all other defaults and might lead to crash the run. States mem per data thread"/> - </when> - <when value="false"/> - </conditional> - <conditional name="analysis_type"> - <param name="analysis_type_selector" type="select" label="Analysis Type"> - <option value="RealignerTargetCreator">RealignerTargetCreator</option> - <option value="IndelRealigner">IndelRealigner</option> - <option value="BaseRecalibrator">BaseRecalibrator</option> - <option value="AnalyzeCovariates">AnalyzeCovariates</option> - <option value="PrintReads">PrintReads</option> - <option value="HaplotypeCaller">HaplotypeCaller</option> - <option value="GenotypeGVCFs">GenotypeGVCFs</option> - <option value="CombineGVCFs">CombineGVCFs</option> - <option value="CombineVariants">CombineVariants</option> - </param> - <when value="RealignerTargetCreator"> - <expand macro="RealignerTargetCreatorParameters" tag="rtc"/> - </when> - <when value="IndelRealigner"> - <expand macro="IndelRealignerParameters" tag="ir"/> - </when> - <when value="BaseRecalibrator"> - <expand macro="BaseRecalibratorParameters" tag="br"/> - </when> - <when value="AnalyzeCovariates"> - <expand macro="AnalyzeCovariatesParameters" tag="ac"/> - </when> - <when value="PrintReads"> - <expand macro="PrintReadsParameters" tag="pr"/> - </when> - <when value="HaplotypeCaller"> - <expand macro="HaplotypeCallerParameters" tag="hc"/> - </when> - <when value="GenotypeGVCFs"> - <expand macro="GenotypeGVCFsParameters" tag="gg"/> - </when> - <when value="CombineGVCFs"> - <expand macro="CombineGVCFsParameters" tag="cg"/> - </when> - <when value="CombineVariants"> - <expand macro="CombineVariantsParameters" tag="cv"/> - </when> - </conditional> - </inputs> - <outputs> - <expand macro="RealignerTargetCreatorOutput" tag="rtc"> - <filter>analysis_type['analysis_type_selector'] == 'RealignerTargetCreator'</filter> - </expand> - <expand macro="IndelRealignerOutput" tag="ir"> - <filter>analysis_type['analysis_type_selector'] == 'IndelRealigner'</filter> - </expand> - <expand macro="BaseRecalibratorOutput" tag="br"> - <filter>analysis_type['analysis_type_selector'] == 'BaseRecalibrator'</filter> - </expand> - <expand macro="AnalyzeCovariatesOutput" tag="ac"> - <filter>analysis_type['analysis_type_selector'] == 'AnalyzeCovariates'</filter> - </expand> - <expand macro="PrintReadsOutput" tag="pr"> - <filter>analysis_type['analysis_type_selector'] == 'PrintReads'</filter> - </expand> - <expand macro="HaplotypeCallerOutput" tag="hc"> - <filter>analysis_type['analysis_type_selector'] == 'HaplotypeCaller'</filter> - </expand> - <expand macro="GenotypeGVCFsOutput" tag="gg"> - <filter>analysis_type['analysis_type_selector'] == 'GenotypeGVCFs'</filter> - </expand> - <expand macro="CombineGVCFsOutput" tag="cg"> - <filter>analysis_type['analysis_type_selector'] == 'CombineGVCFs'</filter> - </expand> - <expand macro="CombineVariantsOutput" tag="cv"> - <filter>analysis_type['analysis_type_selector'] == 'CombineVariants'</filter> - </expand> - <data format="txt" name="output_log" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (log)"/> - </outputs> - <expand macro="macro_tests"/> - <citations> - <citation type="doi">10.1101/gr.107524.110</citation> - <citation type="doi">10.1038/ng.806</citation> - <citation type="doi">10.1002/0471250953.bi1110s43</citation> - </citations> -</tool>
--- a/GATK/gatk/gatk_macros.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,166 +0,0 @@ -<macros> - - <xml name="requirements"> - <requirements> - <requirement type="package">gatk</requirement> - <requirement type="set_environment">GATK_PATH</requirement> - <requirement type="set_environment">GATK_SITE_OPTIONS</requirement> - <requirement type="package" version="3.1.2.1">package_r_for_gatk_3_4_0</requirement> - </requirements> - </xml> - - <xml name="version_command"> - <version_command><![CDATA[ @GATK_EXEC@ --help|grep '^The Genome' ]]></version_command> - </xml> - - <token name="@VERSION@">3.4-0</token> - <token name="@OUTPUT_NAME_PREFIX@">${tool.name} - ${analysis_type.analysis_type_selector}</token> - <token name="@GATK_EXEC@"> -<![CDATA[ - #if $cond_threads.cond_threads_enabled: - #if int($cond_threads.nct) > 1: - THREAD_STRING="-nct $cond_threads.nct" && - #end if - #if int($cond_threads.nt) > 1: - THREAD_STRING=$THREAD_STRING" -nt $cond_threads.nt" && - #end if - #if int($cond_threads.mem) > 0: - GATK_MEM=$cond_threads.mem && - #end if - #end if - java -Xmx\${GATK_MEM:-\${SLURM_MEM_PER_NODE:-4096}}M -jar "\$GATK_PATH/GenomeAnalysisTK.jar" \${THREAD_STRING:-} -]]> - </token> - - <xml name="macro_vcf_input" tokens="tag"> - <param name="input" type="data" format="vcf" multiple="true" label="Variant files (VCF format)" help="-V, ‑‑variant"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build." /> - </param> - </xml> - <token name="@token_vcf_input_pre@" tokens="tag"> -<![CDATA[ - ############################ - ## create links to gVCF input files with correct extensions - ############################ - #for $i, $variant in enumerate($analysis_type.input): - ln -s -f ${variant} variant_${i}.vcf && - #end for -]]> - </token> - <token name="@token_vcf_input@"> -<![CDATA[ - #for $i, $variant in enumerate($analysis_type.input): - --variant variant_${i}.vcf - #end for - @token_reference_input@ -]]> - </token> - - - <xml name="macro_gvcf_input" tokens="tag"> - <param name="input" type="data" format="vcf" multiple="true" label="Variant files (gVCF format)" help="-V, ‑‑variant"> - <validator type="unspecified_build" /> - <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build." /> - </param> - </xml> - <token name="@token_gvcf_input_pre@" tokens="tag"> -<![CDATA[ - ############################ - ## create links to gVCF input files with correct extensions - ############################ - #for $i, $variant in enumerate($analysis_type.input): - ln -s -f ${variant} variant_${i}.g.vcf && - #end for -]]> - </token> - <token name="@token_gvcf_input@"> -<![CDATA[ - #for $i, $variant in enumerate($analysis_type.input): - --variant variant_${i}.g.vcf - #end for - @token_reference_input@ -]]> - </token> - - <xml name="macro_bam_input"> - <conditional name="cond_bam_input"> - <param name="all_in_one" type="boolean" value="false" label="Input all BAM files in a single command" /> - <when value="true"> - <param name="input" type="data" format="bam" multiple="true" label="Input file containing sequence data (BAM)" help="-I, ‑‑input_file"> - <validator type="unspecified_build"/> - <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build."/> - </param> - </when> - <when value="false"> - <param name="input" type="data" format="bam" label="Input file containing sequence data (BAM)" help="-I, ‑‑input_file"> - <validator type="unspecified_build"/> - <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build."/> - </param> - </when> - </conditional> - </xml> - <token name="@token_bam_input_pre@"> -<![CDATA[ - ############################ - ## create links to bam input files with correct extensions - ############################ - #if $analysis_type.cond_bam_input.all_in_one - #for $i, $bam in enumerate($analysis_type.cond_bam_input.input): - ln -s -f ${bam} input_${i}.bam && - ln -s -f ${bam.metadata.bam_index} input_${i}.bam.bai && - #end for - #else - ln -s -f ${analysis_type.cond_bam_input.input} input.bam && - ln -s -f ${analysis_type.cond_bam_input.input.metadata.bam_index} input.bam.bai && - #end if -]]> - </token> - <token name="@token_bam_input@"> -<![CDATA[ - #if $analysis_type.cond_bam_input.all_in_one - #for $i, $bam in enumerate($analysis_type.cond_bam_input.input): - --input_file input_${i}.bam - #end for - #else - --input_file input.bam - #end if - @token_reference_input@ -]]> - </token> - - <token name="@token_reference_input@"> -<![CDATA[ -]]> - </token> - <xml name="macro_input" tokens="tag"> - <yield /> - </xml> - - <xml name="macro_optional_parameters"> - <conditional name="optional_parameters"> - <param name="optional_parameters_enabled" type="boolean" label="Configure Optional Parameters" /> - <when value="true"> - <yield /> - </when> - <when value="false" /> - </conditional> - </xml> - - <xml name="macro_advanced_parameters"> - <conditional name="advanced_parameters"> - <param name="advanced_parameters_enabled" type="boolean" label="Configure Advanced Parameters" /> - <when value="true"> - <yield /> - </when> - <when value="false" /> - </conditional> - </xml> - - <xml name="macro_tests"> - <tests> - - </tests> - </xml> - -</macros>
--- a/GATK/gatk/generation/gatk.xsl Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> -<xsl:output - method="xml" - encoding="utf-8" - indent="yes" - cdata-section-elements="script style" /> - -<xsl:template match="/"> - -<tool id="gatk" name="GATK" version="@VERSION@.d9"> - <description>tool collection Version @VERSION@</description> - - <macros> - <import>gatk_macros.xml</import> - <xsl:for-each select="analyses/analysis"> - <import><xsl:value-of select="macro_file" /></import> - </xsl:for-each> - </macros> - - <expand macro="requirements" /> - - <stdio> - <regex match="^INFO" level="log" /> - <regex match="^WARN" level="warning" /> - <regex match="Using .* implementation of PairHMM" level="warning" /> - <regex match="There is insufficient memory for the Java Runtime Environment to continue" level="fatal" /> - <regex match="^##### ERROR" level="fatal" /> - <exit_code range="1:" level="fatal"/> - </stdio> - - <command> -<xsl:text disable-output-escaping="yes"><![CDATA[ - ############################ - ## import analysis specific preprocessings by using cheetahs internal searchList - ## if not defined, ignore - ############################ - #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2] - #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"] - #include source=$analysisPreprocessing - #end if - - ############################ - ## GATK tool unspecific options - ############################ - @GATK_EXEC@ - - --analysis_type ${analysis_type.analysis_type_selector} - --reference_sequence ${ref_file.fields.path} - - --log_to_file ${output_log} - - #if $cond_intervals.cond_intervals_enabled - #for $interval in $cond_intervals.intervals: - --intervals ${interval.L} - #end for - #end if - - #if $cond_BQSR.cond_BQSR_enabled - --BQSR $cond_BQSR.BQSR - #end if - - ############################ - ## import analysis specific options by using cheetahs internal searchList - ## if not defined throw raw python error until better idea - ############################ - #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2] - #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] - #include source=$analysisOptions - #else - #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] - #end if - - ############################ - ## only put ERROR or FATAL log messages into stderr - ## but keep full log for printing into log file - ############################ - 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2 -]]></xsl:text> - </command> - - <inputs> - - <param name="ref_file" type="select" label="Using reference genome" help="-R,‑‑reference_sequence &lt;reference_sequence&gt;" > - <options from_data_table="picard_indexes"> - <!--filter type="data_meta" key="dbkey" ref="@TAG@_input" column="dbkey" /--> - </options> - <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> - </param> - - <conditional name="cond_intervals"> - <param name="cond_intervals_enabled" type="boolean" label="Select interval subset to operate on?" /> - <when value="true"> - <repeat name="intervals" title="genomic interval over which to operate" help="-L,‑‑intervals &lt;intervals&gt;"> - <param name="L" type="text" value="" /> - </repeat> - </when> - <when value="false" /> - </conditional> - - <conditional name="cond_BQSR"> - <param name="cond_BQSR_enabled" type="boolean" label="Select covariates for on-the-fly recalibration?" /> - <when value="true"> - <param name="BQSR" type="data" format="tabular" label="Input covariates table file for on-the-fly base quality score recalibration" help="-BQSR,‑‑BQSR &lt;BQSR&gt; intended primarily for use with BaseRecalibrator and PrintReads" /> - </when> - <when value="false" /> - </conditional> - - <conditional name="cond_threads"> - <param name="cond_threads_enabled" type="boolean" label="Set computational options (cpu, mem)?" /> - <when value="true"> - <param name="nt" type="integer" value="1" label="Number of data threads to allocate to this analysis" help="make sure, the option is available for the chosen tool" /> - <param name="nct" type="integer" value="1" label="Number of CPU threads to allocate per data thread" help="make sure, the option is available for the chosen tool" /> - <param name="mem" type="integer" value="0" label="Overwrite Memory in MB (0 = don't overwrite)" help="Overwrites all other defaults and might lead to crash the run. States mem per data thread" /> - </when> - <when value="false" /> - </conditional> - - <conditional name="analysis_type"> - <param name="analysis_type_selector" type="select" label="Analysis Type"> - <xsl:for-each select="analyses/analysis"> - <option value="{name}"><xsl:value-of select="name" /></option> - </xsl:for-each> - </param> - <xsl:for-each select="analyses/analysis"> - <when value="{name}"> - <!--xsl:choose> - <xsl:when test="input_type = 'bam'"> - <expand macro="macro_bam_input" tag="{tag}" /> - </xsl:when> - <xsl:when test="input_type = 'gvcf'"> - <expand macro="macro_gvcf_input" tag="{tag}" /> - </xsl:when> - </xsl:choose--> - <expand macro="{name}Parameters" tag="{tag}" /> - </when> - </xsl:for-each> - </conditional> - </inputs> - - <outputs> - <xsl:for-each select="analyses/analysis"> - <expand macro="{name}Output" tag="{tag}"> - <filter>analysis_type['analysis_type_selector'] == '<xsl:value-of select="name" />'</filter> - </expand> - </xsl:for-each> - <data format="txt" name="output_log" label="${{tool.name}} - ${{analysis_type.analysis_type_selector}} on ${{on_string}} (log)" /> - </outputs> - - <expand macro="macro_tests" /> - - <citations> - <citation type="doi">10.1101/gr.107524.110</citation> - <citation type="doi">10.1038/ng.806</citation> - <citation type="doi">10.1002/0471250953.bi1110s43</citation> - </citations> -</tool> - -</xsl:template> -</xsl:stylesheet> - -
--- a/GATK/gatk/generation/gatk.xsldb.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<analyses> - <analysis> - <name>RealignerTargetCreator</name> - <input_type>bam</input_type> - <tag>rtc</tag> - <macro_file>realigner_target_creator.xml</macro_file> - </analysis> - <analysis> - <name>IndelRealigner</name> - <input_type>bam</input_type> - <tag>ir</tag> - <macro_file>indel_realigner.xml</macro_file> - </analysis> - <analysis> - <name>BaseRecalibrator</name> - <input_type>bam</input_type> - <tag>br</tag> - <macro_file>base_recalibrator.xml</macro_file> - </analysis> - <analysis> - <name>AnalyzeCovariates</name> - <input_type>bam</input_type> - <tag>ac</tag> - <macro_file>analyze_covariates.xml</macro_file> - </analysis> - <analysis> - <name>PrintReads</name> - <input_type>bam</input_type> - <tag>pr</tag> - <macro_file>print_reads.xml</macro_file> - </analysis> - <analysis> - <name>HaplotypeCaller</name> - <input_type>bam</input_type> - <tag>hc</tag> - <macro_file>haplotype_caller.xml</macro_file> - </analysis> - <analysis> - <name>GenotypeGVCFs</name> - <input_type>gvcf</input_type> - <tag>gg</tag> - <macro_file>genotype_gvcfs.xml</macro_file> - </analysis> - <analysis> - <name>CombineGVCFs</name> - <input_type>gvcf</input_type> - <tag>cg</tag> - <macro_file>combine_gvcfs.xml</macro_file> - </analysis> - <analysis> - <name>CombineVariants</name> - <input_type>vcf</input_type> - <tag>cv</tag> - <macro_file>combine_variants.xml</macro_file> - </analysis> -</analyses>
--- a/GATK/gatk/genotype_gvcfs.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -<macros> - <xml name="GenotypeGVCFsParameters" tokens="tag"> - - <expand macro="macro_gvcf_input" tag="@TAG@" /> - - <expand macro="macro_optional_parameters"> - - - <param name="sample_ploidy" type="integer" value="2" label="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy)" help="-ploidy,‑‑sample_ploidy &lt;sample_ploidy&gt;" /> - - </expand> - - </xml> - - <xml name="GenotypeGVCFsOutput"> - <data format="vcf" name="gg_output_gvcf" from_work_dir="output.g.vcf" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (gVCF)"> - <yield /> - </data> - </xml> - - <template name="GenotypeGVCFsPreprocessing"> -<![CDATA[ - @token_gvcf_input_pre@ -]]> - </template> - - <template name="GenotypeGVCFsOptions"> -<![CDATA[ - --out output.g.vcf - - @token_gvcf_input@ - - #set $optionals = $analysis_type.optional_parameters - #if $optionals.optional_parameters_enabled - --sample_ploidy $optionals.sample_ploidy - #end if -]]> - </template> - - -</macros> - -
--- a/GATK/gatk/haplotype_caller.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -<macros> - <xml name="HaplotypeCallerParameters" tokens="tag"> - - <expand macro="macro_bam_input" tag="@TAG@" /> - - <conditional name="cond_usage"> - <param name="cond_usage_selector" type="select" label="Select usage"> - <option value="GVCF">Single-sample all-sites calling on DNAseq (GVCF mode)</option> - </param> - <when value="GVCF"> - <expand macro="HaplotypeCallerGVCF" /> - </when> - </conditional> - - <expand macro="macro_optional_parameters"> - - <param name="sample_ploidy" type="integer" value="2" label="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy)" help="-ploidy,‑‑sample_ploidy &lt;sample_ploidy&gt;" /> - - </expand> - - </xml> - - <xml name="HaplotypeCallerOutput"> - <data format="vcf" name="hc_output_gvcf" from_work_dir="output.g.vcf" label="${tool.name} on ${on_string} (gVCF)"> - <yield /> - </data> - </xml> - - <template name="HaplotypeCallerPreprocessing"> -<![CDATA[ - @token_bam_input_pre@ -]]> - </template> - - <template name="HaplotypeCallerOptions"> -<![CDATA[ - --out output.g.vcf - - @token_bam_input@ - - #set $optionals = $analysis_type.optional_parameters - #if $optionals.optional_parameters_enabled - --sample_ploidy $optionals.sample_ploidy - #end if - - #set $usage_selector = $analysis_type.cond_usage.cond_usage_selector - #set $usage = $analysis_type.cond_usage - - #if str($usage_selector) == 'GVCF' - --emitRefConfidence "GVCF" - #end if -]]> - </template> - - - - <xml name="HaplotypeCallerGVCF"> - <param name="emitRefConfidence" type="select" optional="true" label="Mode for emitting reference confidence scores" help="-ERC,‑‑emitRefConfidence &lt;emitRefConfidence&gt;"> - <option value="GVCF">GVCF (Reference model emitted with condensed non-variant blocks)</option> - </param> - </xml> - -</macros> - -
--- a/GATK/gatk/indel_realigner.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -<macros> - <xml name="IndelRealignerParameters" tokens="tag"> - - <expand macro="macro_bam_input" tag="@TAG@" /> - - <param name="targetIntervals" type="data" format="gatk_interval" label="Intervals file output from RealignerTargetCreator" help="-targetIntervals,--targetIntervals &lt;targetIntervals&gt;" /> - - <expand macro="macro_optional_parameters"> - <repeat name="knownAlleles" title="Input VCF file(s) with known indels" help="-known,‑‑knownAlleles &lt;knownAlleles&gt;"> - <param name="knownAllele" type="data" format="vcf" label="Variant file (VCF format)" /> - </repeat> - - <param name="consensusDeterminationModel" type="select" label="minimum reads at a locus to enable using the entropy calculation" help="-model,‑‑consensusDeterminationModel &lt;consensusDeterminationModel&gt;"> - <option value="USE_READS">USE_READS - Additionally uses indels already present in the original alignments of the reads</option> - <option value="KNOWNS_ONLY">KNOWNS_ONLYS - Uses only indels from a provided ROD of known indels</option> - <option value="USE_SW">USE_SW - Additionally uses 'Smith-Waterman' to generate alternate consenses</option> - </param> - <param name="LODThresholdForCleaning" type="float" value="5.0" label="LOD threshold above which the cleaner will clean" help="-LOD,‑‑LODThresholdForCleaning &lt;LODThresholdForCleaning&gt;" /> - <!--param name="nWayOut" type="float" value="5.0" label="Generate one output file for each input (-I) bam file (not compatible with -output)" help="-nWayOut,--nWayOut &lt;nWayOut&gt;" /--> - </expand> - - - <expand macro="macro_advanced_parameters"> - <param name="entropyThreshold" type="float" value="0.15" label="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)" help="-entropy,‑‑entropyThreshold &lt;entropyThreshold&gt;" /> - - <param name="maxConsensuses" type="integer" value="30" label="Max alternate consensuses to try (necessary to improve performance in deep coverage)" help="-maxConsensuses,‑‑maxConsensuses &lt;maxConsensuses&gt;" /> - - <param name="maxIsizeForMovement" type="integer" value="3000" label="maximum insert size of read pairs that we attempt to realign" help="-maxIsize,‑‑maxIsizeForMovement &lt;maxIsizeForMovement&gt;" /> - - <param name="maxPositionalMoveAllowed" type="integer" value="200" label="Maximum positional move in basepairs that a read can be adjusted during realignment" help="-maxPosMove,‑‑maxPositionalMoveAllowed &lt;maxPositionalMoveAllowed&gt;" /> - - <param name="maxReadsForConsensuses" type="integer" value="120" label="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)" help="-greedy,‑‑maxReadsForConsensuses &lt;maxReadsForConsensuses&gt;" /> - - <param name="maxReadsForRealignment" type="integer" value="20000" label="Max reads allowed at an interval for realignment" help="-maxReads,‑‑maxReadsForRealignment &lt;maxReadsForRealignment&gt;" /> - - <param name="maxReadsInMemory" type="integer" value="150000" label="max reads allowed to be kept in memory at a time by the SAMFileWriter" help="-maxInMemory,‑‑maxReadsInMemory &lt;maxReadsInMemory&gt;" /> - - <param name="noOriginalAlignmentTags" type="boolean" truevalue="--noOriginalAlignmentTags" falsevalue="" label="Don't output the original cigar or alignment start tags for each realigned read in the output bam" help="-noTags,‑‑noOriginalAlignmentTags" /> - </expand> - - </xml> - - <xml name="IndelRealignerOutput"> - <data format="bam" name="ir_output_bam" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (BAM)"> - <yield /> - </data> - </xml> - - <template name="IndelRealignerPreprocessing"> -<![CDATA[ - @token_bam_input_pre@ - - ln -s -f ${analysis_type.targetIntervals} target.intervals && - - #if $analysis_type.optional_parameters.optional_parameters_enabled - #for $i, $knownAllele in enumerate($analysis_type.optional_parameters.knownAlleles): - ln -s -f ${knownAllele.knownAllele} knownAllele_${i}.vcf && - #end for - #end if -]]> - </template> - - <template name="IndelRealignerOptions"> -<![CDATA[ - --out ${ir_output_bam} - - @token_bam_input@ - --targetIntervals target.intervals - - #if $analysis_type.optional_parameters.optional_parameters_enabled - #for $i, $knownAllele in enumerate($analysis_type.optional_parameters.knownAlleles): - --knownAlleles knownAllele_${i}.vcf - #end for - --consensusDeterminationModel ${analysis_type.consensusDeterminationModel} - --LODThresholdForCleaning ${analysis_type.LODThresholdForCleaning} - #end if - - #if $analysis_type.advanced_parameters.advanced_parameters_enabled - --entropyThreshold ${analysis_type.advanced_parameters.entropyThreshold} - --maxConsensuses ${analysis_type.advanced_parameters.maxConsensuses} - --maxIsizeForMovement ${analysis_type.advanced_parameters.maxIsizeForMovement} - --maxPositionalMoveAllowed ${analysis_type.advanced_parameters.maxPositionalMoveAllowed} - --maxReadsForConsensuses ${analysis_type.advanced_parameters.maxReadsForConsensuses} - --maxReadsForRealignment ${analysis_type.advanced_parameters.maxReadsForRealignment} - --maxReadsInMemory ${analysis_type.advanced_parameters.maxReadsInMemory} - ${analysis_type.advanced_parameters.noOriginalAlignmentTags} - #end if -]]> - </template> -</macros>
--- a/GATK/gatk/print_reads.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -<macros> - <xml name="PrintReadsParameters" tokens="tag"> - - <expand macro="macro_bam_input" tag="@TAG@" /> - - <!-- BQSR in main config --> - - <expand macro="macro_optional_parameters"> - - <param name="number" type="integer" value="" optional="true" label="Print the first n reads from the file, discarding the rest" help="-n,‑‑number &lt;number&gt;" /> - - <param name="platform" type="text" value="" optional="true" label="Exclude all reads with this platform from the output" help="-platform,‑‑platform &lt;platform&gt;" /> - - <param name="readGroup" type="text" value="" optional="true" label="Exclude all reads with this read group from the output" help="-readGroup,‑‑readGroup &lt;readGroup&gt;" /> - - <param name="sample_file" type="data" format="txt" optional="true" label="File containing a list of samples (one per line). Can be specified multiple times" help="-sf,‑‑sample_file &lt;sample_file&gt;" /> - - <repeat name="sample_names" title="Sample names to be included in the analysis" help="-sn,‑‑sample_name &lt;sample_name&gt;"> - <param name="sample_name" type="text" value="" title="Sample name to be included in the analysis" /> - </repeat> - - <param name="simplify" type="text" truevalue="-s" falsevalue="" label="Erase all extra attributes in the read but keep the read group information" help="-s,‑‑simplify" /> - - </expand> - - </xml> - - <xml name="PrintReadsOutput"> - <data format="bam" name="pr_output_bam" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (BAM)"> - <yield /> - </data> - </xml> - - <template name="PrintReadsPreprocessing"> -<![CDATA[ - @token_bam_input_pre@ -]]> - </template> - - <template name="PrintReadsOptions"> -<![CDATA[ - --out ${pr_output_bam} - - @token_bam_input@ - - #set $optionals = $analysis_type.optional_parameters - #if $optionals.optional_parameters_enabled - #if int($optionals.number) > 0 - --number $optionals.number - #end if - #if str($optionals.platform) - --platform $optionals.platform - #end if - #if str($optionals.readGroup) - --readGroup $optionals.readGroup - #end if - #if $optionals.sample_file - --sample_file $optionals.sample_file - #end if - #if $optionals.sample_names - #for $sample in $optionals.sample_names: - --intervals ${sample.sample_name} - #end for - #end if - $optionals.simplify - #end if -]]> - </template> -</macros> - -
--- a/GATK/gatk/realigner_target_creator.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -<macros> - <xml name="RealignerTargetCreatorParameters" tokens="tag"> - - <expand macro="macro_bam_input" tag="@TAG@" /> - - <expand macro="macro_optional_parameters"> - <param name="maxIntervalSize" type="integer" value="500" label="maximum interval size; any intervals larger than this value will be dropped" help="-maxInterval,‑‑maxIntervalSize &lt;maxIntervalSize&gt;" /> - <param name="minReadsAtLocus" type="integer" value="4" label="minimum reads at a locus to enable using the entropy calculation" help="-minReads,‑‑minReadsAtLocus &lt;minReadsAtLocus&gt;" /> - <param name="windowSize" type="integer" value="10" label="window size for calculating entropy or SNP clusters" help="-window,‑‑windowSize &lt;windowSize&gt;" /> - - <param name="mismatchFraction" type="float" value="0.0" label="fraction of base qualities needing to mismatch for a position to have high entropy" help="-mismatch,‑‑mismatchFraction &lt;mismatchFraction&gt;" /> - <repeat name="rod_bindings" title="Input VCF files with known indels" help="-known,‑‑known &lt;known&gt;"> - <param name="input_rod" type="data" format="vcf" label="Variant file (VCF format)" /> - </repeat> - </expand> - - </xml> - - <xml name="RealignerTargetCreatorOutput"> - <data format="gatk_interval" name="rtc_output_intervals" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (GATK intervals)"> - <yield /> - </data> - </xml> - - <template name="RealignerTargetCreatorPreprocessing"> -<![CDATA[ - @token_bam_input_pre@ - - #if $analysis_type.optional_parameters.optional_parameters_enabled - #for $i, $rod_binding in enumerate($analysis_type.optional_parameters.rod_bindings): - ln -s -f ${rod_binding.input_rod} rod_binding_${i}.vcf && - #end for - #end if -]]> - </template> - - <template name="RealignerTargetCreatorOptions"> -<![CDATA[ - --out ${rtc_output_intervals} - - @token_bam_input@ - - #if $analysis_type.optional_parameters.optional_parameters_enabled - --maxIntervalSize ${analysis_type.optional_parameters.maxIntervalSize} - --minReadsAtLocus ${analysis_type.optional_parameters.minReadsAtLocus} - --windowSize ${analysis_type.optional_parameters.windowSize} - --mismatchFraction ${analysis_type.optional_parameters.mismatchFraction} - - #for $i, $rod_binding in enumerate($analysis_type.optional_parameters.rod_bindings): - --known rod_binding_${i}.vcf - #end for - #end if -]]> - </template> -</macros> - -
--- a/GATK/gatk/tool-data/destinations.py Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -from galaxy.jobs import JobDestination -import os -import sys -import json -import cStringIO -import logging - -log = logging.getLogger( __name__ ) - - -def dump(obj, nested_level=0, output=sys.stdout): - spacing = ' ' - if type(obj) == dict: - print >> output, '%s{' % ((nested_level) * spacing) - for k, v in obj.items(): - if hasattr(v, '__iter__'): - print >> output, '%s%s:' % ((nested_level + 1) * spacing, k) - dump(v, nested_level + 1, output) - else: - print >> output, '%s%s: %s' % ((nested_level + 1) * spacing, k, v) - print >> output, '%s}' % (nested_level * spacing) - elif type(obj) == list: - print >> output, '%s[' % ((nested_level) * spacing) - for v in obj: - if hasattr(v, '__iter__'): - dump(v, nested_level + 1, output) - else: - print >> output, '%s%s' % ((nested_level + 1) * spacing, v) - print >> output, '%s]' % ((nested_level) * spacing) - else: - print >> output, '%s%s' % (nested_level * spacing, obj) - - -def dynamic_slurm_cluster_gatk(job, tool_id): - # Allocate extra time - inp_data = dict( [ ( da.name, da.dataset ) for da in job.input_datasets ] ) - inp_data.update( [ ( da.name, da.dataset ) for da in job.input_library_datasets ] ) - inp_data.update( [ ( da.name, json.loads(da.value) ) for da in job.parameters ] ) - out = cStringIO.StringIO() - dump(inp_data, 1, out) - log.debug(out.getvalue()) - - nativeSpecs = '--nodes=1 --ntasks=1' - - # runner doesn't allow to specify --cpus-per-task - # thus the mem calculation gets messy with more than 1 node - # --> translate nt ==> nodes, nct ==> ntasks - - if 'cond_threads' not in inp_data: - return JobDestination(runner="slurm") - - if inp_data['cond_threads']['cond_threads_enabled'] == "True": - nNodes = int(inp_data['cond_threads']['nt']) - nCPU = int(inp_data['cond_threads']['nct']) - nMEM = int(inp_data['cond_threads']['mem']) - if nMEM > 0: - nativeSpecs = '--nodes=%d --ntasks=%d --mem=%d' % (nNodes, nCPU*nNodes, nMEM) - else: - nativeSpecs = '--nodes=%d --ntasks=%d' % (nNodes, nCPU*nNodes) - - return JobDestination(runner="slurm", params={"nativeSpecification": nativeSpecs}) -
--- a/GATK/gatk/tool-data/picard_index.loc.sample Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use a directory of Picard dict and associated files. You will need -#to create these data files and then create a picard_index.loc file -#similar to this one (store it in this directory) that points to -#the directories in which those files are stored. The picard_index.loc -#file has this format (longer white space is the TAB character): -# -#<unique_build_id> <dbkey> <display_name> <fasta_file_path> -# -#So, for example, if you had hg18 indexed and stored in -#/depot/data2/galaxy/srma/hg18/, -#then the srma_index.loc entry would look like this: -# -#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa -# -#and your /depot/data2/galaxy/srma/hg18/ directory -#would contain the following three files: -#hg18.fa -#hg18.dict -#hg18.fa.fai -# -#The dictionary file for each reference (ex. hg18.dict) must be -#created via Picard (http://picard.sourceforge.net). Note that -#the dict file does not have the .fa extension although the -#path list in the loc file does include it. -#
--- a/GATK/gatk/tool_data_table_conf.xml.sample Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -<tables> - <!-- Location of Picard dict files valid for GATK --> - <table name="picard_indexes" comment_char="#"> - <columns>value, dbkey, name, path</columns> - <file path="tool-data/picard_index.loc" /> - </table> -</tables>
--- a/GATK/gatk/tool_dependencies.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <set_environment version="1.0"> - <environment_variable action="set_to" name="GATK_PATH">/mnt/galaxy/tools/GATK/3.4-0</environment_variable> - </set_environment> - <!-- - Use GATK_SITE_OPTIONS to set additional parameters that should be inserted in every GATK call. - The intended use case was to prohibit GATK to collect and send data. - For example: - - -et "NO_ET" -K "/data/gatk_key_file" ##ET no phone home - --> - <set_environment version="1.0"> - <environment_variable action="set_to" name="GATK_SITE_OPTIONS"> </environment_variable> - </set_environment> - <package name="package_r_for_gatk_3_4_0" version="3.1.2.1"> - <repository changeset_revision="49c62e9b71ad" name="package_r_for_gatk_3_4_0" owner="avowinkel" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>
--- a/GATK/package_picard_1_135/tool_dependencies.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="picard" version="1.135"> - <install version="1.0"> - <actions_group> - <actions architecture="x86_64" os="linux"> - <action type="download_by_url">https://github.com/broadinstitute/picard/releases/download/1.135/picard-tools-1.135.zip</action> - <action type="move_directory_files"> - <source_directory>.</source_directory> - <destination_directory>$INSTALL_DIR</destination_directory> - </action> - </actions> - <action type="set_environment"> - <environment_variable name="JAVA_JAR_PATH" action="set_to">$INSTALL_DIR</environment_variable> - </action> - </actions_group> - </install> - <readme> -This picard package dependency is retrieved directly from https://github.com/broadinstitute/picard/releases - </readme> - </package> -</tool_dependency>
--- a/GATK/package_r_for_gatk_3_4_0/tool_dependencies.xml Thu Sep 12 06:50:21 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="R" version="3.1.2"> - <repository changeset_revision="9f2fddb9d6e2" name="package_r_3_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="package_r_for_gatk_3_4_0" version="3.1.2.1"> - <install version="1.0"> - <actions> - <action type="setup_r_environment"> - - <repository changeset_revision="9f2fddb9d6e2" name="package_r_3_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> - <package name="R" version="3.1.2" /> - </repository> - <package>https://github.com/cran/stringi/archive/0.5-5.tar.gz</package> - <package>https://github.com/cran/magrittr/archive/1.5.tar.gz</package> - <package>https://github.com/cran/stringr/archive/1.0.0.tar.gz</package> - <package>https://github.com/cran/RColorBrewer/archive/1.1-2.tar.gz</package> - <package>https://github.com/cran/dichromat/archive/2.0-0.tar.gz</package> - <package>https://github.com/cran/colorspace/archive/1.2-6.tar.gz</package> - <package>https://github.com/cran/munsell/archive/0.4.2.tar.gz</package> - <package>https://github.com/cran/labeling/archive/0.3.tar.gz</package> - <package>https://github.com/cran/Rcpp/archive/0.11.6.tar.gz</package> - <package>https://github.com/cran/digest/archive/0.6.8.tar.gz</package> - <package>https://github.com/cran/gtable/archive/0.1.2.tar.gz</package> - <package>https://github.com/cran/bitops/archive/1.0-6.tar.gz</package> - <package>https://github.com/cran/caTools/archive/1.17.1.tar.gz</package> - <package>https://github.com/cran/gtools/archive/3.5.0.tar.gz</package> - <package>https://github.com/cran/gdata/archive/2.17.0.tar.gz</package> - <package>https://github.com/cran/gsalib/archive/2.1.tar.gz</package> - <package>https://github.com/cran/gplots/archive/2.17.0.tar.gz</package> - <package>https://github.com/cran/plyr/archive/1.8.3.tar.gz</package> - <package>https://github.com/cran/reshape/archive/0.8.5.tar.gz</package> - <package>https://github.com/cran/reshape2/archive/1.4.1.tar.gz</package> - <package>https://github.com/cran/scales/archive/0.2.5.tar.gz</package> - <package>https://github.com/cran/proto/archive/0.3-10.tar.gz</package> - <package>https://github.com/cran/MASS/archive/7.3-43.tar.gz</package> - <package>https://github.com/cran/ggplot2/archive/1.0.1.tar.gz</package> - </action> - </actions> - </install> - <readme> - ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts. - It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics. - - http://ggplot2.org/ - </readme> - </package> -</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/Haplotypecaller.ga Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,101 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "Haplotypecaller workflow.", + "format-version": "0.1", + "name": "Haplotypecaller", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Input Dataset" + } + ], + "label": null, + "name": "Input dataset", + "outputs": [], + "position": { + "left": 198, + "top": 208.1666717529297 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Input Dataset\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [], + "uuid": "b9fb671f-f07f-498c-8c85-619987501313" + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": { + "analysis_type|cond_bam_input|input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "GATK", + "outputs": [ + { + "name": "rtc_output_intervals", + "type": "gatk_interval" + }, + { + "name": "ir_output_bam", + "type": "bam" + }, + { + "name": "br_table", + "type": "tabular" + }, + { + "name": "ac_plotsReportFile", + "type": "pdf" + }, + { + "name": "pr_output_bam", + "type": "bam" + }, + { + "name": "hc_output_gvcf", + "type": "vcf" + }, + { + "name": "gg_output_gvcf", + "type": "vcf" + }, + { + "name": "cg_output_vcf", + "type": "vcf" + }, + { + "name": "cv_output_vcf", + "type": "vcf" + }, + { + "name": "output_log", + "type": "txt" + } + ], + "position": { + "left": 484.83331298828125, + "top": 200 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/avowinkel/gatk/gatk/3.4-0.d9", + "tool_state": "{\"__page__\": 0, \"cond_threads\": \"{\\\"cond_threads_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\", \"ref_file\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"\\\"}\", \"__rerun_remap_job_id__\": null, \"cond_BQSR\": \"{\\\"cond_BQSR_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\", \"analysis_type\": \"{\\\"analysis_type_selector\\\": \\\"HaplotypeCaller\\\", \\\"cond_usage\\\": {\\\"emitRefConfidence\\\": \\\"GVCF\\\", \\\"__current_case__\\\": 0, \\\"cond_usage_selector\\\": \\\"GVCF\\\"}, \\\"optional_parameters\\\": {\\\"optional_parameters_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}, \\\"cond_bam_input\\\": {\\\"input\\\": null, \\\"all_in_one\\\": \\\"False\\\", \\\"__current_case__\\\": 1}, \\\"__current_case__\\\": 5}\", \"cond_intervals\": \"{\\\"cond_intervals_enabled\\\": \\\"False\\\", \\\"__current_case__\\\": 1}\"}", + "tool_version": "3.4-0.d9", + "type": "tool", + "user_outputs": [], + "uuid": "5988799c-5658-4475-89ee-86f267cfbee1" + } + }, + "uuid": "e4f99de3-d6b0-4554-9562-60b2b0eed2d3" +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/SAMTools.ga Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,160 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "SAMTools workflow", + "format-version": "0.1", + "name": "SAMTools", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Input Dataset" + } + ], + "label": null, + "name": "Input dataset", + "outputs": [], + "position": { + "left": 162, + "top": 169 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Input Dataset\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [], + "uuid": "1eec712c-2fe0-43a8-b033-f027a74c6a48" + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Input Dataset" + } + ], + "label": null, + "name": "Input dataset", + "outputs": [], + "position": { + "left": 273.83331298828125, + "top": 341.83331298828125 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Input Dataset\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [], + "uuid": "98d5e3bf-4715-48a2-9b2b-29b21179c425" + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": { + "input1": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "Sort", + "outputs": [ + { + "name": "output1", + "type": "bam" + } + ], + "position": { + "left": 330, + "top": 222 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_sort/samtools_sort/2.0", + "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"input1\": \"null\", \"sort_mode\": \"\\\"\\\"\"}", + "tool_version": "2.0", + "type": "tool", + "user_outputs": [], + "uuid": "5e9aef3e-c2ad-43d1-b248-7d94d6a974bc" + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": { + "reference_source|input_bams_0|input_bam": { + "id": 2, + "output_name": "output1" + }, + "reference_source|ref_file": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "MPileup", + "outputs": [ + { + "name": "output_mpileup", + "type": "pileup" + }, + { + "name": "output_log", + "type": "txt" + } + ], + "position": { + "left": 484.83331298828125, + "top": 313.83331298828125 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_mpileup/samtools_mpileup/2.0", + "tool_state": "{\"__page__\": 0, \"advanced_options\": \"{\\\"advanced_options_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 1}\", \"__rerun_remap_job_id__\": null, \"genotype_likelihood_computation_type\": \"{\\\"output_mapping_quality\\\": \\\"False\\\", \\\"genotype_likelihood_computation_type_selector\\\": \\\"do_not_perform_genotype_likelihood_computation\\\", \\\"__current_case__\\\": 1, \\\"base_position_on_reads\\\": \\\"False\\\"}\", \"reference_source\": \"{\\\"ref_file\\\": null, \\\"reference_source_selector\\\": \\\"history\\\", \\\"input_bams\\\": [{\\\"__index__\\\": 0, \\\"input_bam\\\": null}], \\\"__current_case__\\\": 1}\"}", + "tool_version": "2.0", + "type": "tool", + "user_outputs": [], + "uuid": "84c6d18e-1930-4eef-b58a-46c88a35339e" + }, + "4": { + "annotation": "", + "id": 4, + "input_connections": { + "input_file": { + "id": 3, + "output_name": "output_mpileup" + } + }, + "inputs": [], + "label": null, + "name": "Pileup to VCF", + "outputs": [ + { + "name": "output_file", + "type": "vcf" + } + ], + "position": { + "left": 562.8333129882812, + "top": 469.83331298828125 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/pileup_to_vcf/pileup_to_vcf/2.2", + "tool_state": "{\"snps_only\": \"\\\"False\\\"\", \"min_cvrg\": \"\\\"5\\\"\", \"allow_multiples\": \"\\\"False\\\"\", \"input_file\": \"null\", \"__page__\": 0, \"vcf_id\": \"\\\"\\\"\", \"__rerun_remap_job_id__\": null, \"cols\": \"{\\\"select_order\\\": \\\"no\\\", \\\"__current_case__\\\": 0}\", \"depth_as\": \"\\\"ref\\\"\", \"min_base_qual\": \"\\\"20\\\"\", \"min_var_pct\": \"\\\"0.5\\\"\"}", + "tool_version": "2.2", + "type": "tool", + "user_outputs": [], + "uuid": "335581bc-838e-431d-8247-1e1580260275" + } + }, + "uuid": "0c40a63d-b349-49b8-90b0-c927a05253ef" +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/SVDetect.ga Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,163 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "SVDetect workflow.", + "format-version": "0.1", + "name": "SVDetect", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [], + "label": null, + "name": "Import data", + "outputs": [ + { + "name": "outbamfile", + "type": "bam" + }, + { + "name": "outlenfile", + "type": "len" + }, + { + "name": "outsvfile", + "type": "sv" + } + ], + "position": { + "left": 171.99998474121094, + "top": 168 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_import/1.0.0", + "tool_state": "{\"file_name\": \"\\\"file1\\\"\", \"__rerun_remap_job_id__\": null, \"type\": \"{\\\"file_type\\\": \\\"bam\\\", \\\"__current_case__\\\": 0}\", \"file_path\": \"\\\"\\\"\", \"__page__\": 0}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [], + "uuid": "296478bf-9b67-47e8-9685-ced05bad2857" + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": {}, + "inputs": [], + "label": null, + "name": "Import data", + "outputs": [ + { + "name": "outbamfile", + "type": "bam" + }, + { + "name": "outlenfile", + "type": "len" + }, + { + "name": "outsvfile", + "type": "sv" + } + ], + "position": { + "left": 266.99998474121094, + "top": 430.99998474121094 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_import/1.0.0", + "tool_state": "{\"file_name\": \"\\\"file1\\\"\", \"__rerun_remap_job_id__\": null, \"type\": \"{\\\"file_type\\\": \\\"len\\\", \\\"__current_case__\\\": 1}\", \"file_path\": \"\\\"\\\"\", \"__page__\": 0}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [], + "uuid": "481cf408-cef1-4ccd-bd08-9394bac04050" + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": { + "inputBam": { + "id": 0, + "output_name": "outbamfile" + } + }, + "inputs": [], + "label": null, + "name": "BAM preprocessing", + "outputs": [ + { + "name": "abBAM", + "type": "bam" + }, + { + "name": "log", + "type": "txt" + }, + { + "name": "normBAM", + "type": "bam" + } + ], + "position": { + "left": 344.49998474121094, + "top": 224 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_preprocessing/1.0.0", + "tool_state": "{\"__page__\": 0, \"isizeMin\": \"\\\"0\\\"\", \"newBam\": \"{\\\"__current_case__\\\": 1, \\\"pairNormal\\\": \\\"no\\\"}\", \"isizeMax\": \"\\\"10000\\\"\", \"inputBam\": \"null\", \"__rerun_remap_job_id__\": null, \"nbrePair\": \"\\\"1000000\\\"\", \"readType\": \"\\\"1\\\"\", \"sample_name\": \"\\\"sample\\\"\", \"pairType\": \"\\\"1\\\"\", \"foldPair\": \"\\\"3.0\\\"\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [], + "uuid": "01f2c8d8-123b-4eac-9c5e-6101e9264dbf" + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": { + "cmap_file": { + "id": 1, + "output_name": "outlenfile" + }, + "mates_file": { + "id": 2, + "output_name": "abBAM" + } + }, + "inputs": [], + "label": null, + "name": "Detect clusters of anomalously mapped pairs", + "outputs": [ + { + "name": "sv_file", + "type": "sv" + }, + { + "name": "circos_file", + "type": "segdup" + }, + { + "name": "bed_file", + "type": "bed" + }, + { + "name": "log_file", + "type": "txt" + } + ], + "position": { + "left": 472.49998474121094, + "top": 375.99998474121094 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/bzeitouni/svdetect/svdetect_run_parallel/1.0.0", + "tool_state": "{\"__page__\": 0, \"sv_type\": \"\\\"all\\\"\", \"getLinks\": \"{\\\"window_size\\\": \\\"3000\\\", \\\"step_length\\\": \\\"250\\\", \\\"splitmate\\\": \\\"True\\\", \\\"__current_case__\\\": 1, \\\"linking\\\": \\\"linking\\\"}\", \"mates_file\": \"null\", \"__rerun_remap_job_id__\": null, \"getFilteredLinks\": \"{\\\"nb_pairs_threshold\\\": \\\"5\\\", \\\"filtering\\\": \\\"filtering\\\", \\\"chromosomes\\\": \\\"\\\", \\\"file_conversion\\\": {\\\"file_conversion_select\\\": \\\"do_not_convert\\\", \\\"__current_case__\\\": 0}, \\\"splitlink\\\": \\\"False\\\", \\\"filter1\\\": {\\\"strand_filtering\\\": \\\"strand\\\", \\\"final_score_threshold\\\": \\\"1.0\\\", \\\"__current_case__\\\": 1, \\\"filter2\\\": {\\\"nb_pairs_order_threshold\\\": \\\"2\\\", \\\"order_filtering\\\": \\\"order\\\", \\\"__current_case__\\\": 1, \\\"filter3\\\": {\\\"indel_sigma_threshold\\\": \\\"3.0\\\", \\\"dup_sigma_threshold\\\": \\\"3.0\\\", \\\"singleton_sigma_threshold\\\": \\\"4.0\\\", \\\"__current_case__\\\": 1, \\\"insert_size_filtering\\\": \\\"insert\\\"}, \\\"mu_length\\\": \\\"3000\\\", \\\"sigma_length\\\": \\\"250\\\"}}, \\\"__current_case__\\\": 1, \\\"links2SV\\\": \\\"True\\\"}\", \"read2_length\": \"\\\"50\\\"\", \"read1_length\": \"\\\"50\\\"\", \"sample_name\": \"\\\"sample\\\"\", \"mates_orientation\": \"\\\"FR\\\"\", \"cmap_file\": \"null\"}", + "tool_version": "1.0.0", + "type": "tool", + "user_outputs": [], + "uuid": "2edd1fca-fc2d-453e-aead-c95a2e250d0a" + } + }, + "uuid": "bb58fd42-403a-48ad-80d2-59d81f0c8f0b" +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/VARSCAN.ga Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,133 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "VARSCAN workflow.", + "format-version": "0.1", + "name": "VARSCAN", + "steps": { + "0": { + "annotation": "", + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Input Dataset" + } + ], + "label": null, + "name": "Input dataset", + "outputs": [], + "position": { + "left": 161, + "top": 187 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Input Dataset\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [], + "uuid": "72a1326a-cd57-4567-8c41-cc66411642be" + }, + "1": { + "annotation": "", + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "Input Dataset" + } + ], + "label": null, + "name": "Input dataset", + "outputs": [], + "position": { + "left": 163.83331298828125, + "top": 263.8333282470703 + }, + "tool_errors": null, + "tool_id": null, + "tool_state": "{\"name\": \"Input Dataset\"}", + "tool_version": null, + "type": "data_input", + "user_outputs": [], + "uuid": "d8444634-af6c-44c5-9b55-8c3284da2823" + }, + "2": { + "annotation": "", + "id": 2, + "input_connections": { + "reference_source|input_bams_0|input_bam": { + "id": 0, + "output_name": "output" + }, + "reference_source|ref_file": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "MPileup", + "outputs": [ + { + "name": "output_mpileup", + "type": "pileup" + }, + { + "name": "output_log", + "type": "txt" + } + ], + "position": { + "left": 370, + "top": 187 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/samtools_mpileup/samtools_mpileup/2.0", + "tool_state": "{\"__page__\": 0, \"advanced_options\": \"{\\\"advanced_options_selector\\\": \\\"basic\\\", \\\"__current_case__\\\": 1}\", \"__rerun_remap_job_id__\": null, \"genotype_likelihood_computation_type\": \"{\\\"output_mapping_quality\\\": \\\"False\\\", \\\"genotype_likelihood_computation_type_selector\\\": \\\"do_not_perform_genotype_likelihood_computation\\\", \\\"__current_case__\\\": 1, \\\"base_position_on_reads\\\": \\\"False\\\"}\", \"reference_source\": \"{\\\"ref_file\\\": null, \\\"reference_source_selector\\\": \\\"history\\\", \\\"input_bams\\\": [{\\\"__index__\\\": 0, \\\"input_bam\\\": null}], \\\"__current_case__\\\": 1}\"}", + "tool_version": "2.0", + "type": "tool", + "user_outputs": [], + "uuid": "26d34295-7783-40e1-950f-a485ff569f5b" + }, + "3": { + "annotation": "", + "id": 3, + "input_connections": { + "in_file": { + "id": 2, + "output_name": "output_mpileup" + } + }, + "inputs": [], + "label": null, + "name": "VarScan mpileup", + "outputs": [ + { + "name": "output", + "type": "vcf" + }, + { + "name": "log", + "type": "txt" + } + ], + "position": { + "left": 563.8333129882812, + "top": 364.83331298828125 + }, + "post_job_actions": {}, + "tool_errors": null, + "tool_id": "toolshed.g2.bx.psu.edu/repos/fcaramia/varscan/varscan_mpileup/2.3.5", + "tool_state": "{\"strand_filter\": \"\\\"1\\\"\", \"min_avg_qual\": \"\\\"15\\\"\", \"min_coverage\": \"\\\"8\\\"\", \"__page__\": 0, \"__rerun_remap_job_id__\": null, \"min_freq_for_hom\": \"\\\"0.75\\\"\", \"min_var_freq\": \"\\\"0.01\\\"\", \"exe_command\": \"\\\"mpileup2snp\\\"\", \"min_reads2\": \"\\\"2\\\"\", \"p_value\": \"\\\"0.99\\\"\", \"variants\": \"\\\"1\\\"\", \"vcf_sample_list\": \"null\", \"in_file\": \"null\"}", + "tool_version": "2.3.5", + "type": "tool", + "user_outputs": [], + "uuid": "5ff6cf71-5d74-42ad-bcdc-7c75891dc10e" + } + }, + "uuid": "5b7bf29e-1272-4211-b345-37695989c6fa" +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/analyze_covariates.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,37 @@ +<macros> + <xml name="AnalyzeCovariatesParameters" tokens="tag"> + + <!-- BQSR in main config --> + + <param name="afterReportFile" type="data" format="tabular" optional="true" label="file containing the BQSR second-pass report file" help="-after,‑‑afterReportFile &lt;afterReportFile&gt;" /> + + <param name="beforeReportFile" type="data" format="tabular" optional="true" label="file containing the BQSR first-pass report file" help="-before,‑‑beforeReportFile &lt;beforeReportFile&gt;" /> + + </xml> + + <xml name="AnalyzeCovariatesOutput"> + <data format="pdf" name="ac_plotsReportFile" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (PDF Recalibration Report)"> + <yield /> + </data> + </xml> + + <template name="AnalyzeCovariatesPreprocessing"> +<![CDATA[ +]]> + </template> + + <template name="AnalyzeCovariatesOptions"> +<![CDATA[ + --plotsReportFile ${ac_plotsReportFile} + + #if str($analysis_type.afterReportFile) + --afterReportFile $analysis_type.afterReportFile + #end if + #if str($analysis_type.beforeReportFile) + --beforeReportFile $analysis_type.beforeReportFile + #end if +]]> + </template> +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/base_recalibrator.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,38 @@ +<macros> + <xml name="BaseRecalibratorParameters" tokens="tag"> + + <expand macro="macro_bam_input" tag="@TAG@" /> + + <param name="knownSites" type="data" format="vcf,bcf,bed,pileup,tabular,table" label="Database of known Sites (ROD files; e.g. VCF format)" multiple="true" title="A database of known polymorphic sites to skip over in the recalibration algorithm" help="-knownSites,‑‑knownSites &lt;knownSites&gt;" /> + + </xml> + + <xml name="BaseRecalibratorOutput"> + <data format="tabular" name="br_table" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (Table)"> + <yield /> + </data> + </xml> + + <template name="BaseRecalibratorPreprocessing"> +<![CDATA[ + @token_bam_input_pre@ + #for $i, $variant in enumerate($analysis_type.knownSites): + ln -s -f ${variant} variant_${i}.vcf && + #end for +]]> + </template> + + <template name="BaseRecalibratorOptions"> +<![CDATA[ + --out ${br_table} + + @token_bam_input@ + + #for $i, $variant in enumerate($analysis_type.knownSites): + --knownSites variant_${i}.vcf + #end for +]]> + </template> +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/combine_gvcfs.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,44 @@ +<macros> + <xml name="CombineGVCFsParameters" tokens="tag"> + + <expand macro="macro_gvcf_input" tag="@TAG@" /> + + <expand macro="macro_optional_parameters"> + + <param name="breakBandsAtMultiplesOf" type="integer" value="0" label="If > 0, reference bands will be broken up at genomic positions that are multiples of this number" help="-breakBandsAtMultiplesOf,‑‑breakBandsAtMultiplesOf &lt;breakBandsAtMultiplesOf&gt;" /> + + </expand> + + </xml> + + <xml name="CombineGVCFsOutput"> + <data format="vcf" name="cg_output_vcf" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (VCF)"> + <yield /> + </data> + </xml> + + <template name="CombineGVCFsPreprocessing"> +<![CDATA[ + @token_gvcf_input_pre@ +]]> + </template> + + <template name="CombineGVCFsOptions"> +<![CDATA[ + --out ${cg_output_vcf} + + @token_gvcf_input@ + + #set $optionals = $analysis_type.optional_parameters + #if $optionals.optional_parameters_enabled + #if $optionals.breakBandsAtMultiplesOf > 0 + --breakBandsAtMultiplesOf $optionals.breakBandsAtMultiplesOf + #end if + #end if +]]> + </template> + + +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/combine_variants.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,96 @@ +<macros> + <xml name="CombineVariantsParameters" tokens="tag"> + + <expand macro="macro_vcf_input" tag="@TAG@" /> + + <expand macro="macro_optional_parameters"> + + <param name="filteredRecordsMergeType" type="select" label="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields" help="-filteredRecordsMergeType,‑‑filteredrecordsmergetype &lt;filteredrecordsmergetype&gt;"> + <option value="">No Selection</option> + <option value="KEEP_IF_ANY_UNFILTERED">KEEP_IF_ANY_UNFILTERED</option> + <option value="KEEP_IF_ALL_UNFILTERED">KEEP_IF_ALL_UNFILTERED</option> + <option value="KEEP_UNCONDITIONAL">KEEP_UNCONDITIONAL</option> + </param> + + <param name="genotypeMergeOptions" type="select" label="Determines how we should merge genotype records for samples shared across the ROD files" help="-genotypeMergeOptions,‑‑genotypemergeoption &lt;genotypemergeoption&gt;"> + <option value="">No Selection</option> + <option value="UNIQUIFY">UNIQUIFY</option> + <option value="PRIORITIZE">PRIORITIZE</option> + <option value="UNSORTED">UNSORTED</option> + <option value="REQUIRE_UNIQUE">REQUIRE_UNIQUE</option> + </param> + + <param name="minimumN" type="integer" value="1" optional="true" label="Combine variants and output site only if the variant is present in at least N input files" help="-minN,‑‑minimumN &lt;minimumN&gt;" /> + + <param name="rod_priority_list" type="text" value="" optional="true" label="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted" help="-priority,‑‑rod_priority_list &lt;rod_priority_list&gt;" /> + + <param name="setKey" type="text" value="" optional="true" label="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from" help="-setKey,‑‑setKey &lt;setKey&gt;" /> + + <param name="assumeIdenticalSamples" type="boolean" truevalue="--assumeIdenticalSamples" falsevalue="" label="If true, assume input VCFs have identical sample sets and disjoint calls" help="-assumeIdenticalSamples,‑‑assumeIdenticalSamples" /> + + <param name="excludeNonVariants" type="boolean" truevalue="--excludeNonVariants" falsevalue="" label="Don't include loci found to be non-variant after the combining procedure" help="-env,‑‑excludeNonVariants" /> + + <param name="filteredAreUncalled" type="boolean" truevalue="--filteredAreUncalled" falsevalue="" label="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF" help="-filteredAreUncalled,‑‑filteredAreUncalled" /> + + <param name="mergeInfoWithMaxAC" type="boolean" truevalue="--mergeInfoWithMaxAC" falsevalue="" label="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records." help="-mergeInfoWithMaxAC,‑‑mergeInfoWithMaxAC" /> + + <param name="minimalVCF" type="boolean" truevalue="--minimalVCF" falsevalue="" label="If true, then the output VCF will contain no INFO or genotype FORMAT fields" help="-minimalVCF,‑‑minimalVCF" /> + + <param name="printComplexMerges" type="boolean" truevalue="--printComplexMerges" falsevalue="" label="Print out interesting sites requiring complex compatibility merging" help="-printComplexMerges,‑‑printComplexMerges" /> + + <param name="suppressCommandLineHeader" type="boolean" truevalue="--suppressCommandLineHeader" falsevalue="" label="If true, do not output the header containing the command line used" help="-suppressCommandLineHeader,‑‑suppressCommandLineHeader" /> + + </expand> + + </xml> + + <xml name="CombineVariantsOutput"> + <data format="vcf" name="cv_output_vcf" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (VCF)"> + <yield /> + </data> + </xml> + + <template name="CombineVariantsPreprocessing"> +<![CDATA[ + @token_vcf_input_pre@ +]]> + </template> + + <template name="CombineVariantsOptions"> +<![CDATA[ + --out ${cv_output_vcf} + + @token_vcf_input@ + + #set $optionals = $analysis_type.optional_parameters + #if $optionals.optional_parameters_enabled + + #if $optionals.filteredRecordsMergeType + --filteredRecordsMergeType $optionals.filteredRecordsMergeType + #end if + #if $optionals.genotypeMergeOptions + --genotypeMergeOptions $optionals.genotypeMergeOptions + #end if + #if $optionals.minimumN != 1 + --minimumN $optionals.minimumN + #end if + #if $optionals.rod_priority_list + --rod_priority_list $optionals.rod_priority_list + #end if + + $optionals.assumeIdenticalSamples + $optionals.excludeNonVariants + $optionals.filteredAreUncalled + $optionals.mergeInfoWithMaxAC + $optionals.minimalVCF + $optionals.printComplexMerges + $optionals.suppressCommandLineHeader + + #end if +]]> + </template> + + +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/gatk.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,179 @@ +<?xml version="1.0" encoding="utf-8"?> +<tool id="gatk" name="GATK" version="@VERSION@.d9"> + <description>tool collection Version @VERSION@</description> + <macros> + <import>gatk_macros.xml</import> + <import>realigner_target_creator.xml</import> + <import>indel_realigner.xml</import> + <import>base_recalibrator.xml</import> + <import>analyze_covariates.xml</import> + <import>print_reads.xml</import> + <import>haplotype_caller.xml</import> + <import>genotype_gvcfs.xml</import> + <import>combine_gvcfs.xml</import> + <import>combine_variants.xml</import> + </macros> + <expand macro="requirements"/> + <stdio> + <regex match="^INFO" level="log"/> + <regex match="^WARN" level="warning"/> + <regex match="Using .* implementation of PairHMM" level="warning"/> + <regex match="There is insufficient memory for the Java Runtime Environment to continue" level="fatal"/> + <regex match="^##### ERROR" level="fatal"/> + <exit_code range="1:" level="fatal"/> + </stdio> + <command><![CDATA[ + ############################ + ## import analysis specific preprocessings by using cheetahs internal searchList + ## if not defined, ignore + ############################ + #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2] + #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"] + #include source=$analysisPreprocessing + #end if + + ############################ + ## GATK tool unspecific options + ############################ + @GATK_EXEC@ + + --analysis_type ${analysis_type.analysis_type_selector} + --reference_sequence ${ref_file.fields.path} + + --log_to_file ${output_log} + + #if $cond_intervals.cond_intervals_enabled + #for $interval in $cond_intervals.intervals: + --intervals ${interval.L} + #end for + #end if + + #if $cond_BQSR.cond_BQSR_enabled + --BQSR $cond_BQSR.BQSR + #end if + + ############################ + ## import analysis specific options by using cheetahs internal searchList + ## if not defined throw raw python error until better idea + ############################ + #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2] + #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] + #include source=$analysisOptions + #else + #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] + #end if + + ############################ + ## only put ERROR or FATAL log messages into stderr + ## but keep full log for printing into log file + ############################ + 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2 +]]></command> + <inputs> + <param name="ref_file" type="select" label="Using reference genome" help="-R,‑‑reference_sequence &lt;reference_sequence&gt;"> + <options from_data_table="picard_indexes"/> + <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> + </param> + <conditional name="cond_intervals"> + <param name="cond_intervals_enabled" type="boolean" label="Select interval subset to operate on?"/> + <when value="true"> + <repeat name="intervals" title="genomic interval over which to operate" help="-L,‑‑intervals &lt;intervals&gt;"> + <param name="L" type="text" value=""/> + </repeat> + </when> + <when value="false"/> + </conditional> + <conditional name="cond_BQSR"> + <param name="cond_BQSR_enabled" type="boolean" label="Select covariates for on-the-fly recalibration?"/> + <when value="true"> + <param name="BQSR" type="data" format="tabular" label="Input covariates table file for on-the-fly base quality score recalibration" help="-BQSR,‑‑BQSR &lt;BQSR&gt; intended primarily for use with BaseRecalibrator and PrintReads"/> + </when> + <when value="false"/> + </conditional> + <conditional name="cond_threads"> + <param name="cond_threads_enabled" type="boolean" label="Set computational options (cpu, mem)?"/> + <when value="true"> + <param name="nt" type="integer" value="1" label="Number of data threads to allocate to this analysis" help="make sure, the option is available for the chosen tool"/> + <param name="nct" type="integer" value="1" label="Number of CPU threads to allocate per data thread" help="make sure, the option is available for the chosen tool"/> + <param name="mem" type="integer" value="0" label="Overwrite Memory in MB (0 = don't overwrite)" help="Overwrites all other defaults and might lead to crash the run. States mem per data thread"/> + </when> + <when value="false"/> + </conditional> + <conditional name="analysis_type"> + <param name="analysis_type_selector" type="select" label="Analysis Type"> + <option value="RealignerTargetCreator">RealignerTargetCreator</option> + <option value="IndelRealigner">IndelRealigner</option> + <option value="BaseRecalibrator">BaseRecalibrator</option> + <option value="AnalyzeCovariates">AnalyzeCovariates</option> + <option value="PrintReads">PrintReads</option> + <option value="HaplotypeCaller">HaplotypeCaller</option> + <option value="GenotypeGVCFs">GenotypeGVCFs</option> + <option value="CombineGVCFs">CombineGVCFs</option> + <option value="CombineVariants">CombineVariants</option> + </param> + <when value="RealignerTargetCreator"> + <expand macro="RealignerTargetCreatorParameters" tag="rtc"/> + </when> + <when value="IndelRealigner"> + <expand macro="IndelRealignerParameters" tag="ir"/> + </when> + <when value="BaseRecalibrator"> + <expand macro="BaseRecalibratorParameters" tag="br"/> + </when> + <when value="AnalyzeCovariates"> + <expand macro="AnalyzeCovariatesParameters" tag="ac"/> + </when> + <when value="PrintReads"> + <expand macro="PrintReadsParameters" tag="pr"/> + </when> + <when value="HaplotypeCaller"> + <expand macro="HaplotypeCallerParameters" tag="hc"/> + </when> + <when value="GenotypeGVCFs"> + <expand macro="GenotypeGVCFsParameters" tag="gg"/> + </when> + <when value="CombineGVCFs"> + <expand macro="CombineGVCFsParameters" tag="cg"/> + </when> + <when value="CombineVariants"> + <expand macro="CombineVariantsParameters" tag="cv"/> + </when> + </conditional> + </inputs> + <outputs> + <expand macro="RealignerTargetCreatorOutput" tag="rtc"> + <filter>analysis_type['analysis_type_selector'] == 'RealignerTargetCreator'</filter> + </expand> + <expand macro="IndelRealignerOutput" tag="ir"> + <filter>analysis_type['analysis_type_selector'] == 'IndelRealigner'</filter> + </expand> + <expand macro="BaseRecalibratorOutput" tag="br"> + <filter>analysis_type['analysis_type_selector'] == 'BaseRecalibrator'</filter> + </expand> + <expand macro="AnalyzeCovariatesOutput" tag="ac"> + <filter>analysis_type['analysis_type_selector'] == 'AnalyzeCovariates'</filter> + </expand> + <expand macro="PrintReadsOutput" tag="pr"> + <filter>analysis_type['analysis_type_selector'] == 'PrintReads'</filter> + </expand> + <expand macro="HaplotypeCallerOutput" tag="hc"> + <filter>analysis_type['analysis_type_selector'] == 'HaplotypeCaller'</filter> + </expand> + <expand macro="GenotypeGVCFsOutput" tag="gg"> + <filter>analysis_type['analysis_type_selector'] == 'GenotypeGVCFs'</filter> + </expand> + <expand macro="CombineGVCFsOutput" tag="cg"> + <filter>analysis_type['analysis_type_selector'] == 'CombineGVCFs'</filter> + </expand> + <expand macro="CombineVariantsOutput" tag="cv"> + <filter>analysis_type['analysis_type_selector'] == 'CombineVariants'</filter> + </expand> + <data format="txt" name="output_log" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (log)"/> + </outputs> + <expand macro="macro_tests"/> + <citations> + <citation type="doi">10.1101/gr.107524.110</citation> + <citation type="doi">10.1038/ng.806</citation> + <citation type="doi">10.1002/0471250953.bi1110s43</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/gatk_macros.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,166 @@ +<macros> + + <xml name="requirements"> + <requirements> + <requirement type="package">gatk</requirement> + <requirement type="set_environment">GATK_PATH</requirement> + <requirement type="set_environment">GATK_SITE_OPTIONS</requirement> + <requirement type="package" version="3.1.2.1">package_r_for_gatk_3_4_0</requirement> + </requirements> + </xml> + + <xml name="version_command"> + <version_command><![CDATA[ @GATK_EXEC@ --help|grep '^The Genome' ]]></version_command> + </xml> + + <token name="@VERSION@">3.4-0</token> + <token name="@OUTPUT_NAME_PREFIX@">${tool.name} - ${analysis_type.analysis_type_selector}</token> + <token name="@GATK_EXEC@"> +<![CDATA[ + #if $cond_threads.cond_threads_enabled: + #if int($cond_threads.nct) > 1: + THREAD_STRING="-nct $cond_threads.nct" && + #end if + #if int($cond_threads.nt) > 1: + THREAD_STRING=$THREAD_STRING" -nt $cond_threads.nt" && + #end if + #if int($cond_threads.mem) > 0: + GATK_MEM=$cond_threads.mem && + #end if + #end if + java -Xmx\${GATK_MEM:-\${SLURM_MEM_PER_NODE:-4096}}M -jar "\$GATK_PATH/GenomeAnalysisTK.jar" \${THREAD_STRING:-} +]]> + </token> + + <xml name="macro_vcf_input" tokens="tag"> + <param name="input" type="data" format="vcf" multiple="true" label="Variant files (VCF format)" help="-V, ‑‑variant"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build." /> + </param> + </xml> + <token name="@token_vcf_input_pre@" tokens="tag"> +<![CDATA[ + ############################ + ## create links to gVCF input files with correct extensions + ############################ + #for $i, $variant in enumerate($analysis_type.input): + ln -s -f ${variant} variant_${i}.vcf && + #end for +]]> + </token> + <token name="@token_vcf_input@"> +<![CDATA[ + #for $i, $variant in enumerate($analysis_type.input): + --variant variant_${i}.vcf + #end for + @token_reference_input@ +]]> + </token> + + + <xml name="macro_gvcf_input" tokens="tag"> + <param name="input" type="data" format="vcf" multiple="true" label="Variant files (gVCF format)" help="-V, ‑‑variant"> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build." /> + </param> + </xml> + <token name="@token_gvcf_input_pre@" tokens="tag"> +<![CDATA[ + ############################ + ## create links to gVCF input files with correct extensions + ############################ + #for $i, $variant in enumerate($analysis_type.input): + ln -s -f ${variant} variant_${i}.g.vcf && + #end for +]]> + </token> + <token name="@token_gvcf_input@"> +<![CDATA[ + #for $i, $variant in enumerate($analysis_type.input): + --variant variant_${i}.g.vcf + #end for + @token_reference_input@ +]]> + </token> + + <xml name="macro_bam_input"> + <conditional name="cond_bam_input"> + <param name="all_in_one" type="boolean" value="false" label="Input all BAM files in a single command" /> + <when value="true"> + <param name="input" type="data" format="bam" multiple="true" label="Input file containing sequence data (BAM)" help="-I, ‑‑input_file"> + <validator type="unspecified_build"/> + <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build."/> + </param> + </when> + <when value="false"> + <param name="input" type="data" format="bam" label="Input file containing sequence data (BAM)" help="-I, ‑‑input_file"> + <validator type="unspecified_build"/> + <validator type="dataset_metadata_in_data_table" table_name="picard_indexes" metadata_name="dbkey" metadata_column="dbkey" message="Sequences are not currently available for the specified build."/> + </param> + </when> + </conditional> + </xml> + <token name="@token_bam_input_pre@"> +<![CDATA[ + ############################ + ## create links to bam input files with correct extensions + ############################ + #if $analysis_type.cond_bam_input.all_in_one + #for $i, $bam in enumerate($analysis_type.cond_bam_input.input): + ln -s -f ${bam} input_${i}.bam && + ln -s -f ${bam.metadata.bam_index} input_${i}.bam.bai && + #end for + #else + ln -s -f ${analysis_type.cond_bam_input.input} input.bam && + ln -s -f ${analysis_type.cond_bam_input.input.metadata.bam_index} input.bam.bai && + #end if +]]> + </token> + <token name="@token_bam_input@"> +<![CDATA[ + #if $analysis_type.cond_bam_input.all_in_one + #for $i, $bam in enumerate($analysis_type.cond_bam_input.input): + --input_file input_${i}.bam + #end for + #else + --input_file input.bam + #end if + @token_reference_input@ +]]> + </token> + + <token name="@token_reference_input@"> +<![CDATA[ +]]> + </token> + <xml name="macro_input" tokens="tag"> + <yield /> + </xml> + + <xml name="macro_optional_parameters"> + <conditional name="optional_parameters"> + <param name="optional_parameters_enabled" type="boolean" label="Configure Optional Parameters" /> + <when value="true"> + <yield /> + </when> + <when value="false" /> + </conditional> + </xml> + + <xml name="macro_advanced_parameters"> + <conditional name="advanced_parameters"> + <param name="advanced_parameters_enabled" type="boolean" label="Configure Advanced Parameters" /> + <when value="true"> + <yield /> + </when> + <when value="false" /> + </conditional> + </xml> + + <xml name="macro_tests"> + <tests> + + </tests> + </xml> + +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/generation/gatk.xsl Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,162 @@ +<?xml version="1.0" encoding="UTF-8"?> +<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> +<xsl:output + method="xml" + encoding="utf-8" + indent="yes" + cdata-section-elements="script style" /> + +<xsl:template match="/"> + +<tool id="gatk" name="GATK" version="@VERSION@.d9"> + <description>tool collection Version @VERSION@</description> + + <macros> + <import>gatk_macros.xml</import> + <xsl:for-each select="analyses/analysis"> + <import><xsl:value-of select="macro_file" /></import> + </xsl:for-each> + </macros> + + <expand macro="requirements" /> + + <stdio> + <regex match="^INFO" level="log" /> + <regex match="^WARN" level="warning" /> + <regex match="Using .* implementation of PairHMM" level="warning" /> + <regex match="There is insufficient memory for the Java Runtime Environment to continue" level="fatal" /> + <regex match="^##### ERROR" level="fatal" /> + <exit_code range="1:" level="fatal"/> + </stdio> + + <command> +<xsl:text disable-output-escaping="yes"><![CDATA[ + ############################ + ## import analysis specific preprocessings by using cheetahs internal searchList + ## if not defined, ignore + ############################ + #if $analysis_type.analysis_type_selector + "Preprocessing" in vars()['SL'][2] + #set $analysisPreprocessing = vars()['SL'][2][$analysis_type.analysis_type_selector + "Preprocessing"] + #include source=$analysisPreprocessing + #end if + + ############################ + ## GATK tool unspecific options + ############################ + @GATK_EXEC@ + + --analysis_type ${analysis_type.analysis_type_selector} + --reference_sequence ${ref_file.fields.path} + + --log_to_file ${output_log} + + #if $cond_intervals.cond_intervals_enabled + #for $interval in $cond_intervals.intervals: + --intervals ${interval.L} + #end for + #end if + + #if $cond_BQSR.cond_BQSR_enabled + --BQSR $cond_BQSR.BQSR + #end if + + ############################ + ## import analysis specific options by using cheetahs internal searchList + ## if not defined throw raw python error until better idea + ############################ + #if $analysis_type.analysis_type_selector + "Options" in vars()['SL'][2] + #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] + #include source=$analysisOptions + #else + #set $analysisOptions = vars()['SL'][2][$analysis_type.analysis_type_selector + "Options"] + #end if + + ############################ + ## only put ERROR or FATAL log messages into stderr + ## but keep full log for printing into log file + ############################ + 2>&1 | awk '\$1 != "INFO" && \$1 != "WARN"' >&2 +]]></xsl:text> + </command> + + <inputs> + + <param name="ref_file" type="select" label="Using reference genome" help="-R,‑‑reference_sequence &lt;reference_sequence&gt;" > + <options from_data_table="picard_indexes"> + <!--filter type="data_meta" key="dbkey" ref="@TAG@_input" column="dbkey" /--> + </options> + <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> + </param> + + <conditional name="cond_intervals"> + <param name="cond_intervals_enabled" type="boolean" label="Select interval subset to operate on?" /> + <when value="true"> + <repeat name="intervals" title="genomic interval over which to operate" help="-L,‑‑intervals &lt;intervals&gt;"> + <param name="L" type="text" value="" /> + </repeat> + </when> + <when value="false" /> + </conditional> + + <conditional name="cond_BQSR"> + <param name="cond_BQSR_enabled" type="boolean" label="Select covariates for on-the-fly recalibration?" /> + <when value="true"> + <param name="BQSR" type="data" format="tabular" label="Input covariates table file for on-the-fly base quality score recalibration" help="-BQSR,‑‑BQSR &lt;BQSR&gt; intended primarily for use with BaseRecalibrator and PrintReads" /> + </when> + <when value="false" /> + </conditional> + + <conditional name="cond_threads"> + <param name="cond_threads_enabled" type="boolean" label="Set computational options (cpu, mem)?" /> + <when value="true"> + <param name="nt" type="integer" value="1" label="Number of data threads to allocate to this analysis" help="make sure, the option is available for the chosen tool" /> + <param name="nct" type="integer" value="1" label="Number of CPU threads to allocate per data thread" help="make sure, the option is available for the chosen tool" /> + <param name="mem" type="integer" value="0" label="Overwrite Memory in MB (0 = don't overwrite)" help="Overwrites all other defaults and might lead to crash the run. States mem per data thread" /> + </when> + <when value="false" /> + </conditional> + + <conditional name="analysis_type"> + <param name="analysis_type_selector" type="select" label="Analysis Type"> + <xsl:for-each select="analyses/analysis"> + <option value="{name}"><xsl:value-of select="name" /></option> + </xsl:for-each> + </param> + <xsl:for-each select="analyses/analysis"> + <when value="{name}"> + <!--xsl:choose> + <xsl:when test="input_type = 'bam'"> + <expand macro="macro_bam_input" tag="{tag}" /> + </xsl:when> + <xsl:when test="input_type = 'gvcf'"> + <expand macro="macro_gvcf_input" tag="{tag}" /> + </xsl:when> + </xsl:choose--> + <expand macro="{name}Parameters" tag="{tag}" /> + </when> + </xsl:for-each> + </conditional> + </inputs> + + <outputs> + <xsl:for-each select="analyses/analysis"> + <expand macro="{name}Output" tag="{tag}"> + <filter>analysis_type['analysis_type_selector'] == '<xsl:value-of select="name" />'</filter> + </expand> + </xsl:for-each> + <data format="txt" name="output_log" label="${{tool.name}} - ${{analysis_type.analysis_type_selector}} on ${{on_string}} (log)" /> + </outputs> + + <expand macro="macro_tests" /> + + <citations> + <citation type="doi">10.1101/gr.107524.110</citation> + <citation type="doi">10.1038/ng.806</citation> + <citation type="doi">10.1002/0471250953.bi1110s43</citation> + </citations> +</tool> + +</xsl:template> +</xsl:stylesheet> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/generation/gatk.xsldb.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,57 @@ +<?xml version="1.0" encoding="UTF-8"?> +<analyses> + <analysis> + <name>RealignerTargetCreator</name> + <input_type>bam</input_type> + <tag>rtc</tag> + <macro_file>realigner_target_creator.xml</macro_file> + </analysis> + <analysis> + <name>IndelRealigner</name> + <input_type>bam</input_type> + <tag>ir</tag> + <macro_file>indel_realigner.xml</macro_file> + </analysis> + <analysis> + <name>BaseRecalibrator</name> + <input_type>bam</input_type> + <tag>br</tag> + <macro_file>base_recalibrator.xml</macro_file> + </analysis> + <analysis> + <name>AnalyzeCovariates</name> + <input_type>bam</input_type> + <tag>ac</tag> + <macro_file>analyze_covariates.xml</macro_file> + </analysis> + <analysis> + <name>PrintReads</name> + <input_type>bam</input_type> + <tag>pr</tag> + <macro_file>print_reads.xml</macro_file> + </analysis> + <analysis> + <name>HaplotypeCaller</name> + <input_type>bam</input_type> + <tag>hc</tag> + <macro_file>haplotype_caller.xml</macro_file> + </analysis> + <analysis> + <name>GenotypeGVCFs</name> + <input_type>gvcf</input_type> + <tag>gg</tag> + <macro_file>genotype_gvcfs.xml</macro_file> + </analysis> + <analysis> + <name>CombineGVCFs</name> + <input_type>gvcf</input_type> + <tag>cg</tag> + <macro_file>combine_gvcfs.xml</macro_file> + </analysis> + <analysis> + <name>CombineVariants</name> + <input_type>vcf</input_type> + <tag>cv</tag> + <macro_file>combine_variants.xml</macro_file> + </analysis> +</analyses>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/genotype_gvcfs.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,43 @@ +<macros> + <xml name="GenotypeGVCFsParameters" tokens="tag"> + + <expand macro="macro_gvcf_input" tag="@TAG@" /> + + <expand macro="macro_optional_parameters"> + + + <param name="sample_ploidy" type="integer" value="2" label="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy)" help="-ploidy,‑‑sample_ploidy &lt;sample_ploidy&gt;" /> + + </expand> + + </xml> + + <xml name="GenotypeGVCFsOutput"> + <data format="vcf" name="gg_output_gvcf" from_work_dir="output.g.vcf" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (gVCF)"> + <yield /> + </data> + </xml> + + <template name="GenotypeGVCFsPreprocessing"> +<![CDATA[ + @token_gvcf_input_pre@ +]]> + </template> + + <template name="GenotypeGVCFsOptions"> +<![CDATA[ + --out output.g.vcf + + @token_gvcf_input@ + + #set $optionals = $analysis_type.optional_parameters + #if $optionals.optional_parameters_enabled + --sample_ploidy $optionals.sample_ploidy + #end if +]]> + </template> + + +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/haplotype_caller.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,65 @@ +<macros> + <xml name="HaplotypeCallerParameters" tokens="tag"> + + <expand macro="macro_bam_input" tag="@TAG@" /> + + <conditional name="cond_usage"> + <param name="cond_usage_selector" type="select" label="Select usage"> + <option value="GVCF">Single-sample all-sites calling on DNAseq (GVCF mode)</option> + </param> + <when value="GVCF"> + <expand macro="HaplotypeCallerGVCF" /> + </when> + </conditional> + + <expand macro="macro_optional_parameters"> + + <param name="sample_ploidy" type="integer" value="2" label="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy)" help="-ploidy,‑‑sample_ploidy &lt;sample_ploidy&gt;" /> + + </expand> + + </xml> + + <xml name="HaplotypeCallerOutput"> + <data format="vcf" name="hc_output_gvcf" from_work_dir="output.g.vcf" label="${tool.name} on ${on_string} (gVCF)"> + <yield /> + </data> + </xml> + + <template name="HaplotypeCallerPreprocessing"> +<![CDATA[ + @token_bam_input_pre@ +]]> + </template> + + <template name="HaplotypeCallerOptions"> +<![CDATA[ + --out output.g.vcf + + @token_bam_input@ + + #set $optionals = $analysis_type.optional_parameters + #if $optionals.optional_parameters_enabled + --sample_ploidy $optionals.sample_ploidy + #end if + + #set $usage_selector = $analysis_type.cond_usage.cond_usage_selector + #set $usage = $analysis_type.cond_usage + + #if str($usage_selector) == 'GVCF' + --emitRefConfidence "GVCF" + #end if +]]> + </template> + + + + <xml name="HaplotypeCallerGVCF"> + <param name="emitRefConfidence" type="select" optional="true" label="Mode for emitting reference confidence scores" help="-ERC,‑‑emitRefConfidence &lt;emitRefConfidence&gt;"> + <option value="GVCF">GVCF (Reference model emitted with condensed non-variant blocks)</option> + </param> + </xml> + +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/indel_realigner.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,90 @@ +<macros> + <xml name="IndelRealignerParameters" tokens="tag"> + + <expand macro="macro_bam_input" tag="@TAG@" /> + + <param name="targetIntervals" type="data" format="gatk_interval" label="Intervals file output from RealignerTargetCreator" help="-targetIntervals,--targetIntervals &lt;targetIntervals&gt;" /> + + <expand macro="macro_optional_parameters"> + <repeat name="knownAlleles" title="Input VCF file(s) with known indels" help="-known,‑‑knownAlleles &lt;knownAlleles&gt;"> + <param name="knownAllele" type="data" format="vcf" label="Variant file (VCF format)" /> + </repeat> + + <param name="consensusDeterminationModel" type="select" label="minimum reads at a locus to enable using the entropy calculation" help="-model,‑‑consensusDeterminationModel &lt;consensusDeterminationModel&gt;"> + <option value="USE_READS">USE_READS - Additionally uses indels already present in the original alignments of the reads</option> + <option value="KNOWNS_ONLY">KNOWNS_ONLYS - Uses only indels from a provided ROD of known indels</option> + <option value="USE_SW">USE_SW - Additionally uses 'Smith-Waterman' to generate alternate consenses</option> + </param> + <param name="LODThresholdForCleaning" type="float" value="5.0" label="LOD threshold above which the cleaner will clean" help="-LOD,‑‑LODThresholdForCleaning &lt;LODThresholdForCleaning&gt;" /> + <!--param name="nWayOut" type="float" value="5.0" label="Generate one output file for each input (-I) bam file (not compatible with -output)" help="-nWayOut,--nWayOut &lt;nWayOut&gt;" /--> + </expand> + + + <expand macro="macro_advanced_parameters"> + <param name="entropyThreshold" type="float" value="0.15" label="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)" help="-entropy,‑‑entropyThreshold &lt;entropyThreshold&gt;" /> + + <param name="maxConsensuses" type="integer" value="30" label="Max alternate consensuses to try (necessary to improve performance in deep coverage)" help="-maxConsensuses,‑‑maxConsensuses &lt;maxConsensuses&gt;" /> + + <param name="maxIsizeForMovement" type="integer" value="3000" label="maximum insert size of read pairs that we attempt to realign" help="-maxIsize,‑‑maxIsizeForMovement &lt;maxIsizeForMovement&gt;" /> + + <param name="maxPositionalMoveAllowed" type="integer" value="200" label="Maximum positional move in basepairs that a read can be adjusted during realignment" help="-maxPosMove,‑‑maxPositionalMoveAllowed &lt;maxPositionalMoveAllowed&gt;" /> + + <param name="maxReadsForConsensuses" type="integer" value="120" label="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)" help="-greedy,‑‑maxReadsForConsensuses &lt;maxReadsForConsensuses&gt;" /> + + <param name="maxReadsForRealignment" type="integer" value="20000" label="Max reads allowed at an interval for realignment" help="-maxReads,‑‑maxReadsForRealignment &lt;maxReadsForRealignment&gt;" /> + + <param name="maxReadsInMemory" type="integer" value="150000" label="max reads allowed to be kept in memory at a time by the SAMFileWriter" help="-maxInMemory,‑‑maxReadsInMemory &lt;maxReadsInMemory&gt;" /> + + <param name="noOriginalAlignmentTags" type="boolean" truevalue="--noOriginalAlignmentTags" falsevalue="" label="Don't output the original cigar or alignment start tags for each realigned read in the output bam" help="-noTags,‑‑noOriginalAlignmentTags" /> + </expand> + + </xml> + + <xml name="IndelRealignerOutput"> + <data format="bam" name="ir_output_bam" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (BAM)"> + <yield /> + </data> + </xml> + + <template name="IndelRealignerPreprocessing"> +<![CDATA[ + @token_bam_input_pre@ + + ln -s -f ${analysis_type.targetIntervals} target.intervals && + + #if $analysis_type.optional_parameters.optional_parameters_enabled + #for $i, $knownAllele in enumerate($analysis_type.optional_parameters.knownAlleles): + ln -s -f ${knownAllele.knownAllele} knownAllele_${i}.vcf && + #end for + #end if +]]> + </template> + + <template name="IndelRealignerOptions"> +<![CDATA[ + --out ${ir_output_bam} + + @token_bam_input@ + --targetIntervals target.intervals + + #if $analysis_type.optional_parameters.optional_parameters_enabled + #for $i, $knownAllele in enumerate($analysis_type.optional_parameters.knownAlleles): + --knownAlleles knownAllele_${i}.vcf + #end for + --consensusDeterminationModel ${analysis_type.consensusDeterminationModel} + --LODThresholdForCleaning ${analysis_type.LODThresholdForCleaning} + #end if + + #if $analysis_type.advanced_parameters.advanced_parameters_enabled + --entropyThreshold ${analysis_type.advanced_parameters.entropyThreshold} + --maxConsensuses ${analysis_type.advanced_parameters.maxConsensuses} + --maxIsizeForMovement ${analysis_type.advanced_parameters.maxIsizeForMovement} + --maxPositionalMoveAllowed ${analysis_type.advanced_parameters.maxPositionalMoveAllowed} + --maxReadsForConsensuses ${analysis_type.advanced_parameters.maxReadsForConsensuses} + --maxReadsForRealignment ${analysis_type.advanced_parameters.maxReadsForRealignment} + --maxReadsInMemory ${analysis_type.advanced_parameters.maxReadsInMemory} + ${analysis_type.advanced_parameters.noOriginalAlignmentTags} + #end if +]]> + </template> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/print_reads.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,71 @@ +<macros> + <xml name="PrintReadsParameters" tokens="tag"> + + <expand macro="macro_bam_input" tag="@TAG@" /> + + <!-- BQSR in main config --> + + <expand macro="macro_optional_parameters"> + + <param name="number" type="integer" value="" optional="true" label="Print the first n reads from the file, discarding the rest" help="-n,‑‑number &lt;number&gt;" /> + + <param name="platform" type="text" value="" optional="true" label="Exclude all reads with this platform from the output" help="-platform,‑‑platform &lt;platform&gt;" /> + + <param name="readGroup" type="text" value="" optional="true" label="Exclude all reads with this read group from the output" help="-readGroup,‑‑readGroup &lt;readGroup&gt;" /> + + <param name="sample_file" type="data" format="txt" optional="true" label="File containing a list of samples (one per line). Can be specified multiple times" help="-sf,‑‑sample_file &lt;sample_file&gt;" /> + + <repeat name="sample_names" title="Sample names to be included in the analysis" help="-sn,‑‑sample_name &lt;sample_name&gt;"> + <param name="sample_name" type="text" value="" title="Sample name to be included in the analysis" /> + </repeat> + + <param name="simplify" type="text" truevalue="-s" falsevalue="" label="Erase all extra attributes in the read but keep the read group information" help="-s,‑‑simplify" /> + + </expand> + + </xml> + + <xml name="PrintReadsOutput"> + <data format="bam" name="pr_output_bam" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (BAM)"> + <yield /> + </data> + </xml> + + <template name="PrintReadsPreprocessing"> +<![CDATA[ + @token_bam_input_pre@ +]]> + </template> + + <template name="PrintReadsOptions"> +<![CDATA[ + --out ${pr_output_bam} + + @token_bam_input@ + + #set $optionals = $analysis_type.optional_parameters + #if $optionals.optional_parameters_enabled + #if int($optionals.number) > 0 + --number $optionals.number + #end if + #if str($optionals.platform) + --platform $optionals.platform + #end if + #if str($optionals.readGroup) + --readGroup $optionals.readGroup + #end if + #if $optionals.sample_file + --sample_file $optionals.sample_file + #end if + #if $optionals.sample_names + #for $sample in $optionals.sample_names: + --intervals ${sample.sample_name} + #end for + #end if + $optionals.simplify + #end if +]]> + </template> +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/realigner_target_creator.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,57 @@ +<macros> + <xml name="RealignerTargetCreatorParameters" tokens="tag"> + + <expand macro="macro_bam_input" tag="@TAG@" /> + + <expand macro="macro_optional_parameters"> + <param name="maxIntervalSize" type="integer" value="500" label="maximum interval size; any intervals larger than this value will be dropped" help="-maxInterval,‑‑maxIntervalSize &lt;maxIntervalSize&gt;" /> + <param name="minReadsAtLocus" type="integer" value="4" label="minimum reads at a locus to enable using the entropy calculation" help="-minReads,‑‑minReadsAtLocus &lt;minReadsAtLocus&gt;" /> + <param name="windowSize" type="integer" value="10" label="window size for calculating entropy or SNP clusters" help="-window,‑‑windowSize &lt;windowSize&gt;" /> + + <param name="mismatchFraction" type="float" value="0.0" label="fraction of base qualities needing to mismatch for a position to have high entropy" help="-mismatch,‑‑mismatchFraction &lt;mismatchFraction&gt;" /> + <repeat name="rod_bindings" title="Input VCF files with known indels" help="-known,‑‑known &lt;known&gt;"> + <param name="input_rod" type="data" format="vcf" label="Variant file (VCF format)" /> + </repeat> + </expand> + + </xml> + + <xml name="RealignerTargetCreatorOutput"> + <data format="gatk_interval" name="rtc_output_intervals" label="${tool.name} - ${analysis_type.analysis_type_selector} on ${on_string} (GATK intervals)"> + <yield /> + </data> + </xml> + + <template name="RealignerTargetCreatorPreprocessing"> +<![CDATA[ + @token_bam_input_pre@ + + #if $analysis_type.optional_parameters.optional_parameters_enabled + #for $i, $rod_binding in enumerate($analysis_type.optional_parameters.rod_bindings): + ln -s -f ${rod_binding.input_rod} rod_binding_${i}.vcf && + #end for + #end if +]]> + </template> + + <template name="RealignerTargetCreatorOptions"> +<![CDATA[ + --out ${rtc_output_intervals} + + @token_bam_input@ + + #if $analysis_type.optional_parameters.optional_parameters_enabled + --maxIntervalSize ${analysis_type.optional_parameters.maxIntervalSize} + --minReadsAtLocus ${analysis_type.optional_parameters.minReadsAtLocus} + --windowSize ${analysis_type.optional_parameters.windowSize} + --mismatchFraction ${analysis_type.optional_parameters.mismatchFraction} + + #for $i, $rod_binding in enumerate($analysis_type.optional_parameters.rod_bindings): + --known rod_binding_${i}.vcf + #end for + #end if +]]> + </template> +</macros> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/tool-data/destinations.py Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,62 @@ +from galaxy.jobs import JobDestination +import os +import sys +import json +import cStringIO +import logging + +log = logging.getLogger( __name__ ) + + +def dump(obj, nested_level=0, output=sys.stdout): + spacing = ' ' + if type(obj) == dict: + print >> output, '%s{' % ((nested_level) * spacing) + for k, v in obj.items(): + if hasattr(v, '__iter__'): + print >> output, '%s%s:' % ((nested_level + 1) * spacing, k) + dump(v, nested_level + 1, output) + else: + print >> output, '%s%s: %s' % ((nested_level + 1) * spacing, k, v) + print >> output, '%s}' % (nested_level * spacing) + elif type(obj) == list: + print >> output, '%s[' % ((nested_level) * spacing) + for v in obj: + if hasattr(v, '__iter__'): + dump(v, nested_level + 1, output) + else: + print >> output, '%s%s' % ((nested_level + 1) * spacing, v) + print >> output, '%s]' % ((nested_level) * spacing) + else: + print >> output, '%s%s' % (nested_level * spacing, obj) + + +def dynamic_slurm_cluster_gatk(job, tool_id): + # Allocate extra time + inp_data = dict( [ ( da.name, da.dataset ) for da in job.input_datasets ] ) + inp_data.update( [ ( da.name, da.dataset ) for da in job.input_library_datasets ] ) + inp_data.update( [ ( da.name, json.loads(da.value) ) for da in job.parameters ] ) + out = cStringIO.StringIO() + dump(inp_data, 1, out) + log.debug(out.getvalue()) + + nativeSpecs = '--nodes=1 --ntasks=1' + + # runner doesn't allow to specify --cpus-per-task + # thus the mem calculation gets messy with more than 1 node + # --> translate nt ==> nodes, nct ==> ntasks + + if 'cond_threads' not in inp_data: + return JobDestination(runner="slurm") + + if inp_data['cond_threads']['cond_threads_enabled'] == "True": + nNodes = int(inp_data['cond_threads']['nt']) + nCPU = int(inp_data['cond_threads']['nct']) + nMEM = int(inp_data['cond_threads']['mem']) + if nMEM > 0: + nativeSpecs = '--nodes=%d --ntasks=%d --mem=%d' % (nNodes, nCPU*nNodes, nMEM) + else: + nativeSpecs = '--nodes=%d --ntasks=%d' % (nNodes, nCPU*nNodes) + + return JobDestination(runner="slurm", params={"nativeSpecification": nativeSpecs}) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/tool-data/picard_index.loc.sample Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,26 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Picard dict and associated files. You will need +#to create these data files and then create a picard_index.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The picard_index.loc +#file has this format (longer white space is the TAB character): +# +#<unique_build_id> <dbkey> <display_name> <fasta_file_path> +# +#So, for example, if you had hg18 indexed and stored in +#/depot/data2/galaxy/srma/hg18/, +#then the srma_index.loc entry would look like this: +# +#hg18 hg18 hg18 Pretty /depot/data2/galaxy/picard/hg18/hg18.fa +# +#and your /depot/data2/galaxy/srma/hg18/ directory +#would contain the following three files: +#hg18.fa +#hg18.dict +#hg18.fa.fai +# +#The dictionary file for each reference (ex. hg18.dict) must be +#created via Picard (http://picard.sourceforge.net). Note that +#the dict file does not have the .fa extension although the +#path list in the loc file does include it. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/tool_data_table_conf.xml.sample Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,7 @@ +<tables> + <!-- Location of Picard dict files valid for GATK --> + <table name="picard_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/picard_index.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/gatk/tool_dependencies.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<tool_dependency> + <set_environment version="1.0"> + <environment_variable action="set_to" name="GATK_PATH">/mnt/galaxy/tools/GATK/3.4-0</environment_variable> + </set_environment> + <!-- + Use GATK_SITE_OPTIONS to set additional parameters that should be inserted in every GATK call. + The intended use case was to prohibit GATK to collect and send data. + For example: + + -et "NO_ET" -K "/data/gatk_key_file" ##ET no phone home + --> + <set_environment version="1.0"> + <environment_variable action="set_to" name="GATK_SITE_OPTIONS"> </environment_variable> + </set_environment> + <package name="package_r_for_gatk_3_4_0" version="3.1.2.1"> + <repository changeset_revision="49c62e9b71ad" name="package_r_for_gatk_3_4_0" owner="avowinkel" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/package_r_for_gatk_3_4_0/tool_dependencies.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,48 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="R" version="3.1.2"> + <repository changeset_revision="9f2fddb9d6e2" name="package_r_3_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="package_r_for_gatk_3_4_0" version="3.1.2.1"> + <install version="1.0"> + <actions> + <action type="setup_r_environment"> + + <repository changeset_revision="9f2fddb9d6e2" name="package_r_3_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu"> + <package name="R" version="3.1.2" /> + </repository> + <package>https://github.com/cran/stringi/archive/0.5-5.tar.gz</package> + <package>https://github.com/cran/magrittr/archive/1.5.tar.gz</package> + <package>https://github.com/cran/stringr/archive/1.0.0.tar.gz</package> + <package>https://github.com/cran/RColorBrewer/archive/1.1-2.tar.gz</package> + <package>https://github.com/cran/dichromat/archive/2.0-0.tar.gz</package> + <package>https://github.com/cran/colorspace/archive/1.2-6.tar.gz</package> + <package>https://github.com/cran/munsell/archive/0.4.2.tar.gz</package> + <package>https://github.com/cran/labeling/archive/0.3.tar.gz</package> + <package>https://github.com/cran/Rcpp/archive/0.11.6.tar.gz</package> + <package>https://github.com/cran/digest/archive/0.6.8.tar.gz</package> + <package>https://github.com/cran/gtable/archive/0.1.2.tar.gz</package> + <package>https://github.com/cran/bitops/archive/1.0-6.tar.gz</package> + <package>https://github.com/cran/caTools/archive/1.17.1.tar.gz</package> + <package>https://github.com/cran/gtools/archive/3.5.0.tar.gz</package> + <package>https://github.com/cran/gdata/archive/2.17.0.tar.gz</package> + <package>https://github.com/cran/gsalib/archive/2.1.tar.gz</package> + <package>https://github.com/cran/gplots/archive/2.17.0.tar.gz</package> + <package>https://github.com/cran/plyr/archive/1.8.3.tar.gz</package> + <package>https://github.com/cran/reshape/archive/0.8.5.tar.gz</package> + <package>https://github.com/cran/reshape2/archive/1.4.1.tar.gz</package> + <package>https://github.com/cran/scales/archive/0.2.5.tar.gz</package> + <package>https://github.com/cran/proto/archive/0.3-10.tar.gz</package> + <package>https://github.com/cran/MASS/archive/7.3-43.tar.gz</package> + <package>https://github.com/cran/ggplot2/archive/1.0.1.tar.gz</package> + </action> + </actions> + </install> + <readme> + ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts. + It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics. + + http://ggplot2.org/ + </readme> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/suite_samtools_1_2/repository_dependencies.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,17 @@ +<?xml version="1.0"?> +<repositories description="A suite of Galaxy tools designed to work with version 1.2 of the SAMtools package."> + <repository changeset_revision="cf875cbe2df4" name="data_manager_sam_fasta_index_builder" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="af7c50162f0b" name="bam_to_sam" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="d04d9f1c6791" name="sam_to_bam" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="8c3472790020" name="samtools_bedcov" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="1ebb4ecdc1ef" name="samtools_calmd" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="0072bf593791" name="samtools_flagstat" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="87398ae795c7" name="samtools_idxstats" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="c6fdfe3331d6" name="samtools_mpileup" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="020e144b5f78" name="samtools_reheader" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="3735f950b2f5" name="samtools_rmdup" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="2b474ebbfc7d" name="samtools_slice_bam" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="a430da4f04cd" name="samtools_sort" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="57f3e32f809d" name="samtools_split" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="0d71d9467847" name="samtools_stats" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> +</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/BAM_preprocessingPairs.pl Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,340 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; +use Getopt::Std; +my $version = '0.5_galaxy'; + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; + +my %opts = ( t=>1, p=>1, n=>1000000, f=>3, s=>0, S=>10000, o=>"." ); + +getopts('dt:p:n:f:s:S:o:b:l:x:N:', \%opts); #GALAXY + +my $working_dir=($opts{o} ne ".")? $opts{o}:"working directory"; + +my $pt_bad_mates_file=$opts{b}; #GALAXY +my $pt_log_file=$opts{l}; #GALAXY +my $pt_good_mates_file=$opts{x} if($opts{d}); #GALAXY + + +die(qq/ + +Description: + + Preprocessing of mates to get anomalously mapped mate-pair\/paired-end reads as input + for SVDetect. + + From all pairs mapped onto the reference genome, this script outputs abnormal pairs: + - mapped on two different chromosomes + - with an incorrect strand orientation and\/or pair order + - with an insert size distance +- sigma threshold + into a file <prefix.ab.bam\/sam> + + -BAM\/SAM File input format only, sorted by read names + + Version : $version + SAMtools required for BAM files + + +Usage: BAM_preprocessingPairs.pl [options] <all_mate_file.sorted.bam\/sam> + +Options: -t BOOLEAN read type: =1 (Illumina), =0 (SOLiD) [$opts{t}] + -p BOOLEAN pair type: =1 (paired-end), =0 (mate-pair) [$opts{p}] + -n INTEGER number of pairs for calculating mu and sigma lengths [$opts{n}] + -s INTEGER minimum value of ISIZE for calculating mu and sigma lengths [$opts{s}] + -S INTEGER maximum value of ISIZE for calculating mu and sigma lengths [$opts{S}] + -f REAL minimal number of sigma fold for filtering pairs [$opts{f}] + -d dump normal pairs into a file [<prefix.norm.bam\/sam>] (optional) + -o STRING output directory [$working_dir] + +\n/) if (@ARGV == 0 && -t STDIN); + +unless (-d $opts{o}){ + mkdir $opts{o} or die; +} +$opts{o}.="/" if($opts{o}!~/\/$/); + +my $mates_file=shift(@ARGV); + +$mates_file=readlink($mates_file); + +my $bad_mates_file=(split(/\//,$mates_file))[$#_]; + +if($bad_mates_file=~/.(s|b)am$/){ + $bad_mates_file=~s/.(b|s)am$/.ab.sam/; + $bad_mates_file=$opts{o}.$bad_mates_file; +} + +else{ + die "Error: mate_file with the extension <.bam> or <.sam> needed !\n"; +} + +my $good_mates_file; +if($opts{d}){ + $good_mates_file=(split(/\//,$mates_file))[$#_]; + $good_mates_file=~s/.(b|s)am$/.norm.sam/; + $good_mates_file=$opts{o}.$good_mates_file; +} + +my $log_file=$opts{o}.$opts{N}.".svdetect_preprocessing.log"; #GALAXY + +#------------------------------------------------------------------------------# +#Calculate mu and sigma + +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n"; + +print LOG "\# Calculating mu and sigma lengths...\n"; +print LOG "-- file=$mates_file\n"; +print LOG "-- n=$opts{n}\n"; +print LOG "-- ISIZE min=$opts{s}, max=$opts{S}\n"; + +my ($record, $sumX,$sumX2) = (0,0,0); +my $warn=$opts{n}/10; +my $prev_pair="FIRST"; + +my $bam=($mates_file =~ /.bam$/)? 1:0; + +if($bam){ + open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; +}else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; +} + +while(<MATES>){ + + my @t=split; + + next if ($t[0]=~/^@/); + + my $current_pair=$t[0]; + next if($current_pair eq $prev_pair); + $prev_pair=$current_pair; + + my ($chr1,$chr2,$length)=($t[2],$t[6],abs($t[8])); + + next if (($t[1]&0x0004) || ($t[1]&0x0008)); + next if ($length<$opts{s} || $length>$opts{S}) ; + + if($chr2 eq "="){ + + $sumX += $length; #add to sum and sum^2 for mean and variance calculation + $sumX2 += $length*$length; + $record++; + } + + if($record>$warn){ + print LOG "-- $warn pairs analysed\n"; + $warn+=$warn; + } + + last if ($record>$opts{n}); + +} +close (MATES); + +$record--; +my $mu = $sumX/$record; +my $sigma = sqrt($sumX2/$record - $mu*$mu); + +print LOG "-- Total : $record pairs analysed\n"; +print LOG "-- mu length = ".decimal($mu,1).", sigma length = ".decimal($sigma,1)."\n"; + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Preprocessing pairs + +$warn=100000; + +$record=0; +my %count=( ab=>0, norm=>0, chr=>0, sense=>0, dist=>0, unmap=>0); + +my $read_type=($opts{t})? "Illumina":"SOLiD"; +my $pair_type=($opts{p})? "paired-end":"mate-paired"; + +print LOG "\# Preprocessing pairs...\n"; +print LOG "-- file= $mates_file\n"; +print LOG "-- type= $read_type $pair_type reads\n"; +print LOG "-- sigma threshold= $opts{f}\n"; +print LOG "-- using ".decimal($mu-$opts{f}*$sigma,4)."-".decimal($mu+$opts{f}*$sigma,4)." as normal range of insert size\n"; + +my @header; + +if($bam){ + open(HEADER, "${SAMTOOLS_BIN_DIR}/samtools view -H $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; + @header=<HEADER>; + close HEADER; + open(MATES, "${SAMTOOLS_BIN_DIR}/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; +}else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; +} + +open AB, ">$bad_mates_file" or die "$0: can't write in the output: $bad_mates_file :$!\n"; +print AB @header if($bam); + +if($opts{d}){ + open NORM, ">$good_mates_file" or die "$0: can't write in the output: $good_mates_file :$!\n"; + print NORM @header if($bam); +} + +$prev_pair="FIRST"; +my $prev_bad; + +while(<MATES>){ + + my @t=split; + my $bad=0; + + if ($t[0]=~/^@/){ + print AB; + print NORM if ($opts{d}); + next; + } + + my $current_pair=$t[0]; + if($current_pair eq $prev_pair){ + next if($prev_bad==-1); + if($prev_bad){ + print AB; + }elsif(!$prev_bad){ + print NORM if($opts{d}); + } + next; + } + + $prev_pair=$current_pair; + + my ($chr1,$chr2,$pos1,$pos2,$length)=($t[2],$t[6],$t[3],$t[7], abs($t[8])); + + if (($t[1]&0x0004) || ($t[1]&0x0008)){ + $prev_bad=-1; + $count{unmap}++; + $record++; + next; + + } + + my $strand1 = (($t[1]&0x0010))? 'R':'F'; + my $strand2 = (($t[1]&0x0020))? 'R':'F'; + my $order1 = (($t[1]&0x0040))? '1':'2'; + my $order2 = (($t[1]&0x0080))? '1':'2'; + + if($order1 == 2){ + ($strand1,$strand2)=($strand2,$strand1); + ($chr1,$chr2)=($chr2,$chr1); + ($pos1,$pos2)=($pos2,$pos1); + ($order1,$order2)=($order2,$order1); + } + + my $sense=$strand1.$strand2; + + if($chr1 ne "=" && $chr2 ne "="){ + $bad=1; + $count{chr}++; + } + + if($opts{p}){ #paired-end + if(!(($sense eq "FR" && $pos1<$pos2) || ($sense eq "RF" && $pos2<$pos1))){ + $bad=1; + $count{sense}++; + } + }else{ #mate-pair + if($opts{t}){ #Illumina + if(!(($sense eq "FR" && $pos2<$pos1) || ($sense eq "RF" && $pos1<$pos2))){ + $bad=1; + $count{sense}++; + } + }else{ #SOLiD + if(!(($sense eq "FF" && $pos2<$pos1) || ($sense eq "RR" && $pos1<$pos2))){ + $bad=1; + $count{sense}++; + } + } + } + + if(($chr1 eq "=" || $chr2 eq "=") && ($length <$mu - $opts{f}*$sigma || $length>$mu + $opts{f}*$sigma)){ + $bad=1; + $count{dist}++; + } + + if($bad){ + print AB; + $count{ab}++; + $prev_bad=$bad; + }else{ + print NORM if ($opts{d}); + $count{norm}++; + $prev_bad=$bad; + } + + $record++; + + if($record>$warn){ + print LOG "-- $warn pairs analysed\n"; + $warn+=100000; + } +} + +close AB; +close NORM if($opts{d}); + +print LOG "-- Total : $record pairs analysed\n"; +print LOG "-- $count{unmap} pairs whose one or both reads are unmapped\n"; +print LOG "-- ".($count{ab}+$count{norm})." mapped pairs\n"; +print LOG "---- $count{ab} abnormal mapped pairs\n"; +print LOG "------ $count{chr} pairs mapped on two different chromosomes\n"; +print LOG "------ $count{sense} pairs with incorrect strand orientation and\/or pair order\n"; +print LOG "------ $count{dist} pairs with incorrect insert size distance\n"; +print LOG "--- $count{norm} correct mapped pairs\n"; + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#OUTPUT + +if($bam){ + + my $bam_file=$bad_mates_file; + $bam_file=~s/.sam$/.bam/; + print LOG "\# Converting sam to bam for abnormal mapped pairs\n"; + system("${SAMTOOLS_BIN_DIR}/samtools view -bS $bad_mates_file > $bam_file 2>".$opts{o}."samtools.log"); + unlink($bad_mates_file); + print LOG "-- output created: $bam_file\n"; + + system "rm $pt_bad_mates_file ; ln -s $bam_file $pt_bad_mates_file"; #GALAXY + + if($opts{d}){ + $bam_file=$good_mates_file; + $bam_file=~s/.sam$/.bam/; + print LOG "\# Converting sam to bam for correct mapped pairs\n"; + system("${SAMTOOLS_BIN_DIR}/samtools view -bS $good_mates_file > $bam_file 2>".$opts{o}."samtools.log"); + unlink($good_mates_file); + print LOG "-- output created: $bam_file\n"; + + system "rm $pt_good_mates_file ; ln -s $bam_file $pt_good_mates_file"; #GALAXY + + } + +} + +else{ + print LOG "-- output created: $bad_mates_file\n"; + print LOG "-- output created: $good_mates_file\n" if($opts{d}); +} + +close LOG; + +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} +#------------------------------------------------------------------------------#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/BAM_preprocessingPairs.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,77 @@ +<tool id="svdetect_preprocessing" name="BAM preprocessing"> + + <description>to get abnormal pairs</description> + + <command interpreter="perl"> BAM_preprocessingPairs.pl -t '$readType' -p '$pairType' -n '$nbrePair' -s '$isizeMin' -S '$isizeMax' -f '$foldPair' -o $__new_file_path__/svdetect -b '$abBAM' -l '$log' -N $sample_name + #if $newBam.pairNormal=="yes" + -d -x '$normBAM' + #end if + '$inputBam' + </command> + + <inputs> + <param name="sample_name" type="text" value="sample" label="Sample Name"/> + <param name="inputBam" type="data" format="bam" label="BAM input file"/> + <param name="readType" type="select" label="Read type"> + <option value="1">Illumina</option> + <option value="0">SOLiD</option> + </param> + <param name="pairType" type="select" label="Library type"> + <option value="1">Paired-end</option> + <option value="0">Mate-Pair</option> + </param> + <conditional name="newBam"> + <param name="pairNormal" type="select" label="Do you want an additional bam file listing concordant mapped pairs?" help="Dump normal pairs into a file sample_name.norm.bam/sam"> + <option value="no">No</option> + <option value="yes">Yes</option> + </param> + <when value="yes"> + <!-- do nothing here --> + </when> + <when value="no"> + <!-- do nothing here --> + </when> + </conditional> + <param name="nbrePair" value="1000000" type="integer" size="30" label="Number of pairs for calculating mu (µ) and sigma (σ) lengths"/> + <param name="isizeMin" value="0" type="integer" size="30" label="Minimum value of ISIZE for calculating mu (µ) and sigma (σ) lengths"/> + <param name="isizeMax" value="10000" type="integer" size="30" label="Maximum value of ISIZE for calculating mu (µ)and sigma( σ) lengths"/> + <param name="foldPair" value="3" type="float" size="30" label="Minimal number of sigma (σ) fold for filtering pairs"/> + </inputs> + + <outputs> + <data format="bam" name="abBAM" label="${$sample_name}.ab.bam"/> + <data format="txt" name="log" label="${$sample_name}.svdetect_preprocessing.log"/> + <data format="bam" name="normBAM" label="${$sample_name}.norm.bam"> + <filter>newBam['pairNormal'] == 'yes'</filter> + </data> + </outputs> + + <help> + +**What it does** + +Bam_preprocessingPairs - Version 0.5 + +Preprocessing of mates to get anomalously mapped mate-pair/paired-end reads as input for SVDetect. + +From all pairs mapped onto the reference genome, this script outputs abnormal pairs: + + * mapped on two different chromosomes + * with an incorrect strand orientation and/or pair order + * with an insert size distance +- sigma threshold + +into a file prefix.ab.bam/sam sorted by read names + +-BAM/SAM File input format only. + +SAMtools required for BAM files + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/SVDetect_compare.pl Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,716 @@ +#!/usr/bin/perl -w + +=pod + +=head1 NAME + +SVDetect Compare for Galaxy + +Version: 0.8b for Galaxy + +=head1 SYNOPSIS + +SVDetect_compare.pl links2compare -conf <configuration_file> [-help] [-man] + +=cut + +# ------------------------------------------------------------------- + +use strict; +use warnings; + +use Pod::Usage; +use Getopt::Long; + +use Config::General; +use Tie::IxHash; + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#PARSE THE COMMAND LINE +my %OPT; +GetOptions(\%OPT, + 'conf=s', + 'out1=s', #GALAXY + 'out2=s', #GALAXY + 'out3=s', #GALAXY + 'out4=s', #GALAXY + 'out5=s', #GALAXY + 'out6=s', #GALAXY + 'out7=s', #GALAXY + 'out8=s', #GALAXY + 'out9=s', #GALAXY + 'l=s', #GALAXY + 'N=s', #GALAXY + 'help', + 'man' + ); + +pod2usage() if $OPT{help}; +pod2usage(-verbose=>2) if $OPT{man}; +pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); + + +pod2usage() if(@ARGV<1); + +tie (my %func, 'Tie::IxHash',links2compare=>\&links2compare); + +foreach my $command (@ARGV){ + pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); +} +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#READ THE CONFIGURATION FILE +my $conf=Config::General->new( -ConfigFile => $OPT{conf}, + -Tie => "Tie::IxHash", + -AllowMultiOptions => 1, + -LowerCaseNames => 1, + -AutoTrue => 1); +my %CONF= $conf->getall; +validateconfiguration(\%CONF); #validation of the configuration parameters + + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY +my $BEDTOOLS_BIN_DIR="/bioinfo/local/BEDTools/bin"; #GALAXY + +my $pt_log_file=$OPT{l}; #GALAXY +my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_compare.log"; #GALAXY +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY + +my @pt_sv_file=($OPT{out1},$OPT{out2},$OPT{out3}) if($OPT{out1}); #GALAXY common,sample,reference +my @pt_circos_file=($OPT{out4},$OPT{out5},$OPT{out6}) if($OPT{out4}); #GALAXY common,sample,reference +my @pt_bed_file=($OPT{out7},$OPT{out8},$OPT{out9}) if($OPT{out7}); #GALAXY common,sample,reference + +$CONF{compare}{sample_link_file}=readlink($CONF{compare}{sample_link_file});#GALAXY +$CONF{compare}{sample_link_file}=~s/.sv.txt//; #GALAXY + +$CONF{compare}{reference_link_file}=readlink($CONF{compare}{reference_link_file});#GALAXY +$CONF{compare}{reference_link_file}=~s/.sv.txt//; #GALAXY + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#COMMAND EXECUTION +foreach my $command (@ARGV){ + &{$func{$command}}(); +} +print LOG "-- end\n"; + +close LOG;#GALAXY +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY + +exit(0); +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#FUNCTIONS + +# -----------------------------------------------------------------------------# +#MAIN FUNCTION number 5:Comparison between samples, common or specific links +sub links2compare{ + + my @compare_files; + + compareSamples($CONF{general}{output_dir}, + $CONF{compare}{list_samples}, + $CONF{compare}{sample_link_file}, + $CONF{compare}{reference_link_file}, + $CONF{compare}{min_overlap}, + $CONF{compare}{same_sv_type}, + \@compare_files); + + my $pt_ind=0; + + for my $input_file (@compare_files){ + + $input_file=$CONF{general}{output_dir}.$input_file; + + my $output_file=$input_file; + $output_file=~s/unique$/compared/; + + sortLinks($input_file, $output_file,1); + + if($CONF{compare}{circos_output}){ + links2segdup($CONF{circos}{organism_id}, + $CONF{circos}{colorcode}, + $output_file, + $output_file.".segdup.txt"); + system "rm $pt_circos_file[$pt_ind]; ln -s $output_file.segdup.txt $pt_circos_file[$pt_ind]" if (defined $pt_circos_file[$pt_ind]); #GALAXY + } + + if($CONF{compare}{bed_output}){ + links2bedfile($CONF{compare}{read_lengths}, + $CONF{bed}{colorcode}, + $output_file, + $output_file.".bed"); + system "rm $pt_bed_file[$pt_ind]; ln -s $output_file.bed $pt_bed_file[$pt_ind]" if (defined $pt_bed_file[$pt_ind]); #GALAXY + } + + if($CONF{compare}{sv_output}){ + + links2SVfile ($output_file, $output_file.".sv.txt"); + system "rm $pt_sv_file[$pt_ind]; ln -s $output_file.sv.txt $pt_sv_file[$pt_ind]" if (defined $pt_sv_file[$pt_ind]); #GALAXY + } + $pt_ind++; + + } + unlink(@compare_files); + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub compareSamples{ + + my ($dir,$list_samples,$sample_file,$reference_file,$min_overlap,$same_sv_type,$file_names)=@_; + + my @bedpefiles; + my @list=split(",",$list_samples); + my @list_files=($sample_file,$reference_file); + + print LOG "\# Comparison procedure...\n"; + print LOG "-- samples=$list_samples\n". + "-- minimum overlap=$min_overlap\n". + "-- same SV type=$same_sv_type\n"; + + #conversion of links to bedPE format file + print LOG "-- Conversion of links.filtered files to bedPE format\n"; + for my $s (0..$#list) { + + links2bedPElinksfile($list[$s],$list_files[$s],$list_files[$s].".bedpe.txt"); + push(@bedpefiles,$list_files[$s].".bedpe.txt"); + + } + + #get common links between all samples compared + print LOG "-- Getting common links between all samples with BEDTools\n"; + my $common_name=join(".",@list); + + my $nb=scalar @list; + my $command=""; + my $prompt=">"; + + while ($nb>0){ + + for my $i (0..$#list_files){ + + $command.="$BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a ".$list_files[$i].".bedpe.txt"; + my $pipe=0; + + for my $j ($i+1..$#list_files){ + + $command.="| $BEDTOOLS_BIN_DIR/pairToPair -type both -f $min_overlap -a stdin" if($pipe); + $command.=" -b ".$list_files[$j].".bedpe.txt"; + $pipe=1; + + } + + $command.=$prompt.$dir.$common_name.".bedpe.tmp;"; + $prompt=">>"; + + my $first=shift(@list_files); push(@list_files,$first); + last; + } + $nb--; + } + + system ($command); + + push(@bedpefiles,$dir.$common_name.".bedpe.tmp"); + + #Post comparison to get common links if same type only (as an option) + open( FILE, "<".$dir.$common_name.".bedpe.tmp") or die "Can't open".$dir.$common_name.".bedpe.tmp : $!"; + open( OUT, ">".$dir.$common_name.".bedpe.unique") or die "Can't write in ".$dir.$common_name.".bedpe.unique : $!"; + + while(<FILE>){ + my @t=split("\t",$_); + my $s=(split("_",$t[6]))[0]; + my ($sv1,$sv2)=($t[7],$t[18]); + splice(@t,11,$#t); + + if($same_sv_type){ + print OUT join("\t",@t)."\n" if($sv1 eq $sv2); + }else{ + print OUT join("\t",@t)."\n"; + } + } + close FILE; + close OUT; + + bedPElinks2linksfile($dir.$common_name.".bedpe.unique", $dir.$common_name.".unique"); + push(@bedpefiles,$dir.$common_name.".bedpe.unique"); + push(@$file_names,$common_name.".unique"); + print LOG "-- output created: ".$dir.$common_name.".compared\n"; + + #get specific links for each sample + print LOG "-- Getting specific links for each sample\n"; + for my $s (0..$#list) { + system("grep -Fxv -f ".$dir.$common_name.".bedpe.unique ".$list_files[$s].".bedpe.txt >".$dir.$list[$s].".bedpe.unique"); + bedPElinks2linksfile($dir.$list[$s].".bedpe.unique",$dir.$list[$s].".unique"); + push(@bedpefiles,$dir.$list[$s].".bedpe.unique"); + push(@$file_names,$list[$s].".unique"); + print LOG "-- output created: ".$dir.$list[$s].".compared\n"; + } + + unlink(@bedpefiles); + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the circos format +sub links2segdup{ + + my($id,$color_code,$links_file,$segdup_file)=@_; + + print LOG "\# Converting to the circos format...\n"; + + tie (my %hcolor,'Tie::IxHash'); #color-code hash table + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + } + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + my $index=1; + while(<LINKS>){ + + my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; + + my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links + + print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output + "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; + $index++; + } + + close LINKS; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bedPE format for BEDTools usage +sub links2bedPElinksfile{ + + my ($sample,$links_file,$bedpe_file)=@_; + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; + + my $nb_links=1; + + while(<LINKS>){ + + chomp; + my @t=split("\t",$_); + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); + my $type=($chr1 eq $chr2)? "INTRA":"INTER"; + $type.="_".$t[10]; + + $start1--; $start2--; + + print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". + "\t$sample"."_link$nb_links\t$type\t.\t.". + "\t".join("|",@t)."\n"; + + $nb_links++; + } + + close LINKS; + close BEDPE; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub bedPElinks2linksfile{ + + my ($bedpe_file,$links_file)=@_; + + open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; + open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; + + while(<BEDPE>){ + + chomp; + my $sample=(split("_",(split("\t",$_))[6]))[0]; + my @t1=(split("\t",$_))[0,1,2,3,4,5]; + my @t2=split(/\|/,(split("\t",$_))[10]); + push(@t2,$sample); + + print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; + + } + close BEDPE; + close LINKS; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bed format +sub links2bedfile{ + + my ($tag_length,$color_code,$links_file,$bed_file)=@_; + + print LOG "\# Converting to the bed format...\n"; + + my $compare=1; + if($links_file!~/compared$/){ + $compare=0; + $tag_length->{none}->{1}=$tag_length->{1}; + $tag_length->{none}->{2}=$tag_length->{2}; + } + + #color-code hash table + tie (my %hcolor,'Tie::IxHash'); + my %color_order; + $color_order{"255,255,255"}=0; + my $n=1; + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + $color_order{$col}=$n; + $n++; + } + + my %pair; + my %pt; + $n=1; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + + my %str=( "F"=>"+", "R"=>"-" ); + + while(<LINKS>){ + + my @t=split; + my $sample=($compare)? pop(@t):"none"; + + my $chr1=$t[0]; + my $chr2=$t[3]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $same_chr=($chr1 eq $chr2)? 1:0; + + my $count=$t[6]; + my $color=getColor($count,\%hcolor,"bed"); + + my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); + my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); + my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); + my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); + my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); + my @position1=deleteBadOrderSensePairs(split(",",$t[14])); + my @position2=deleteBadOrderSensePairs(split(",",$t[15])); + my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); + my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); + + + for my $p (0..$#pairs){ + + if (!exists $pair{$pairs[$p]}){ + + if($same_chr){ + + $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, + $start1[$p]-1, $end2[$p], $color, + 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; + $pt{$n}=$pair{$pairs[$p]}->{0}; + $n++; + + }else{ + + $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, + $start1[$p]-1, $end1[$p], $color, + 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{1}; + $n++; + + + $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, + $start2[$p]-1, $end2[$p], $color, + 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{2}; + $n++; + } + }else{ + + if($same_chr){ + ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); + }else{ + ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); + ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); + } + } + } + } + close LINKS; + + my $nb_pairs=$n-1; + + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". + "visibility=2 itemRgb=\"On\"\n"; + + for my $i (1..$nb_pairs){ + print BED join("\t",@{$pt{$i}})."\n"; + } + + close BED; + + print LOG "-- output created: $bed_file\n"; + + undef %pair; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub links2SVfile{ + + my($links_file,$sv_file)=@_; + + print LOG "\# Converting to the sv output table...\n"; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist + chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering + final_score breakpoint1_start1-end1 breakpoint2_start2-end2); + + my $nb_links=0; + + while (<LINKS>){ + + my @t=split; + my @sv=(); + my $sv_type="-"; + my $strand_ratio="-"; + my $eq_ratio="-"; + my $eq_type="-"; + my $insert_ratio="-"; + my $link="-"; + my ($bk1, $bk2)=("-","-"); + my $score="-"; + + my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); + my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); + my $nb_pairs=$t[6]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; + + #if strand filtering + if (defined $t[16]){ + #if inter-chr link + $sv_type=$t[16]; + if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ + $strand_ratio=floor($1/$2*100)."%"; + $score=$t[18]; + } + if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ + #if intra-chr link with insert size filtering + $strand_ratio=floor($1/$2*100)."%"; + $link=floor($t[17]); + if($sv_type!~/^INV/){ + $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + $score=$t[20]; + }else{ + $score=$t[19]; + } + } + } + + if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ + + #if strand and order filtering only and/or interchr link + $eq_type=$t[18]; + $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + ($bk1, $bk2)=($t[20],$t[21]); + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + $score=$t[22]; + + }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ + + #if all three filtering + $link=floor($t[17]); + $eq_type=$t[19]; + $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); + + if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ + $insert_ratio=floor($1/$2*100)."%"; + ($bk1, $bk2)=($t[22],$t[23]); + $score=$t[24]; + + }else{ + ($bk1, $bk2)=($t[21],$t[22]); + $score=$t[23]; + } + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + + } + + + push(@sv, $chr_type, $sv_type,$eq_type); + push(@sv,"$chr1\t$start1-$end1"); + push(@sv, $link); + push(@sv,"$chr2\t$start2-$end2", + $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); + + + print SV join("\t",@sv)."\n"; + } + + close LINKS; + close SV; + + system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; + + open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; + my @links=<SV>; + close SV; + + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + print SV join("\t",@header)."\n"; + print SV @links; + close SV; + + unlink($sv_file.".sorted"); + + print LOG "-- output created: $sv_file\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub deleteBadOrderSensePairs{ + + my (@tab)=@_; + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v) if($v!~/[\$\*\@]$/); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets starts and ends Coords when start=leftmost given positions, directions and orders +sub getCoordswithLeftMost { + + my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; + + for my $i (0..scalar(@{$positions})-1) { + + if($strand->[$i] eq 'F'){ + $starts->[$i]=$positions->[$i]; + $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; + }else{ + $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; + $ends->[$i]=$positions->[$i]; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub floor { + my $nb = $_[0]; + $nb=~ s/\..*//; + return $nb; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Sort links according the concerned chromosomes and their coordinates +sub sortLinks{ + + my ($links_file,$sortedlinks_file,$unique)=@_; + + print LOG "# Sorting links...\n"; + + my $pipe=($unique)? "| sort -u":""; + system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getColor{ + + my($count,$hcolor,$format)=@_; + for my $col ( keys % { $hcolor} ) { + return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); + } + return "white" if($format eq "circos"); + return "255,255,255" if($format eq "bed"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#check if the configuration file is correct +sub validateconfiguration{ + + my %conf=%{$_[0]}; + my $list_prgs="@ARGV"; + + my @circos_params=qw(organism_id colorcode); + my @bed_params=qw(colorcode); + my @compare_params=qw(list_samples list_read_lengths sample_link_file reference_link_file); + + unless (defined($conf{general}{output_dir})) { + $conf{general}{output_dir} = "."; + } + unless (-d $conf{general}{output_dir}){ + mkdir $conf{general}{output_dir} or die; + } + $conf{general}{output_dir}.="/" if($conf{general}{output_dir}!~/\/$/); + + + if($list_prgs=~/links2compare/){ + foreach my $p (@compare_params) { + die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); + } + + unless (defined($conf{compare}{same_sv_type})) { + $conf{compare}{same_sv_type} = 0; + } + + unless (defined($conf{compare}{min_overlap})) { + $conf{compare}{min_overlap} = 1E-9; + } + + if($conf{compare}{circos_output}){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + if($conf{compare}{bed_output}){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); + + my @samples=split(",",$conf{compare}{list_samples}); + my @read_lengths=split(",",$conf{compare}{list_read_lengths}); + for my $i (0..$#samples){ + my @l=split("-",$read_lengths[$i]); + $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; + } + } + } + + +} +#------------------------------------------------------------------------------# +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/SVDetect_compare.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,218 @@ +<tool id="svdetect_compare" name="Compare"> + +<description>structural variants between two samples</description> + +<command interpreter="perl">SVDetect_compare.pl links2compare -conf '$config_file' -l '$log_file' -N '$sample_name.$reference_name' + +#if $links2SV +-out1 '$common_sv_file' +-out2 '$sample_sv_file' +-out3 '$reference_sv_file' +#end if + +#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2circos +-out4 '$common_circos_file' +-out5 '$sample_circos_file' +-out6 '$reference_circos_file' +#end if + +#if $file_conversion.file_conversion_select=="convert" and $file_conversion.links2bed +-out7 '$common_bed_file' +-out8 '$sample_bed_file' +-out9 '$reference_bed_file' +#end if + +</command> + +<inputs> + <param name="sample_name" type="text" size="20" value="sample" label="Sample Name"/> + <param name="sample_read1_length" type="integer" size="10" value="50" label="Sample read 1 length (bp)"/> + <param name="sample_read2_length" type="integer" size="10" value="50" label="Sample read 2 length (bp)"/> + <param name="sample_mates_file" type="data" format="sv" label="Sample input file" help=".sv file"/> + + <param name="reference_name" type="text" size="20" value="reference" label="Reference Name"/> + <param name="reference_read1_length" type="integer" size="10" value="50" label="Reference read 1 length (bp)"/> + <param name="reference_read2_length" type="integer" size="10" value="50" label="Reference read 2 length (bp)"/> + <param name="reference_mates_file" type="data" format="sv" label="Reference input file" help=".sv file"/> + + <param name="min_overlap" type="float" size="10" value="0.05" label="Minimum overlap of links required as a fraction"/> + <param name="same_sv_type" label="Comparison of SVs with the same type only ?" type="boolean" truevalue="1" falsevalue="0" checked="True"/> + + <param name="links2SV" label="Do you want to have filtered links in a tabulated file format showing significant SVs?" type="boolean" truevalue="1" falsevalue="0" checked="True"/> + + <conditional name="file_conversion"> + <param name="file_conversion_select" type="select" label="Output file conversion" help="Converts filtered links to Circos/BED files format for graphical view of SVs"> + <option value="do_not_convert">No</option> + <option value="convert">Yes</option> + </param> + <when value="do_not_convert"> + <!-- do nothing here --> + </when> + <when value="convert"> + <param name="links2circos" label="Converts the link list to the Circos link format" type="boolean" truevalue="1" falsevalue="0" checked="True"/> + <param name="links2bed" label="Converts the link list to the UCSC BED format" type="boolean" truevalue="1" falsevalue="0" checked="False"/> + <param name="organism_id" type="text" size="10" value="hs" label="Organism ID"/> + <repeat name="color_code" title="Color-code" min="1" max="7"> + <param name="color" type="select" label="Color"> + <option value="grey">grey</option> + <option value="black">black</option> + <option value="blue">blue</option> + <option value="green">green</option> + <option value="purple">purple</option> + <option value="orange">orange</option> + <option value="red">red</option> + </param> + <param name="interval" type="text" value="1,3" label="Interval"/> + </repeat> + </when> + </conditional> +</inputs> + + + +<outputs> + <data format="sv" name="common_sv_file" label="common.compared.sv"> + <filter>links2SV is True</filter> + </data> + <data format="sv" name="sample_sv_file" label="${sample_name}.compared.sv"> + <filter>links2SV is True</filter> + </data> + <data format="sv" name="reference_sv_file" label="${reference_name}.compared.sv"> + <filter>links2SV is True</filter> + </data> + + <data format="segdup" name="common_circos_file" label="common.compared.segdup"> + <filter>( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2circos'] is True + ) + </filter> + </data> + <data format="segdup" name="sample_circos_file" label="${sample_name}.compared.segdup"> + <filter>( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2circos'] is True + ) + </filter> + </data> + <data format="segdup" name="reference_circos_file" label="${reference_name}.compared.segdup"> + <filter>( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2circos'] is True + ) + </filter> + </data> + + <data format="bed" name="common_bed_file" label="common.compared.bed"> + <filter>( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2bed'] is True + ) + </filter> + </data> + <data format="bed" name="sample_bed_file" label="${sample_name}.compared.bed"> + <filter>( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2bed'] is True + ) + </filter> + </data> + <data format="bed" name="reference_bed_file" label="${reference_name}.compared.bed"> + <filter>( + file_conversion['file_conversion_select']=="convert" and + file_conversion['links2bed'] is True + ) + </filter> + </data> + + <data format="txt" name="log_file" label="${sample_name}.${reference_name}.svdetect_compare.log"/> +</outputs> + + + +<configfiles> + <configfile name="config_file"> +<general> +output_dir=$__new_file_path__/svdetect +</general> + +#if $file_conversion.file_conversion_select == "convert" +#if $file_conversion.links2circos +<circos> +organism_id=${file_conversion.organism_id} +<colorcode> +#for $color_repeat in $file_conversion.color_code +${color_repeat.color}=${color_repeat.interval} +#end for +</colorcode> +</circos> +#end if +#if $file_conversion.links2bed +<bed> +<colorcode> +#for $color_repeat in $file_conversion.color_code +#if str($color_repeat.color)== "grey" +190,190,190=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "black" +0,0,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "blue" +0,0,255=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "green" +0,255,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "purple" +153,50,205=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "orange" +255,140,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "red" +255,0,0=${color_repeat.interval} +#end if +#end for +</colorcode> +</bed> +#end if +#end if + +<compare> +list_samples=${sample_name},${reference_name} +list_read_lengths=${sample_read1_length}-${sample_read2_length},${reference_read1_length}-${reference_read2_length} +sample_link_file=${sample_mates_file} +reference_link_file=${reference_mates_file} +min_overlap=${min_overlap} +same_sv_type=${same_sv_type} +sv_output=${links2SV} +#if $file_conversion.file_conversion_select == "convert" +circos_output=${$file_conversion.links2circos} +bed_output=${$file_conversion.links2bed} +#end if +</compare> + + </configfile> +</configfiles> + + <help> +**What it does** + +SVDetect - Version : 0.8b + +Comparison of clusters between two samples to get common or sample-specific SVs + +This program is designed to compare filtered links between two anomalously mapped mate-pair/paired-end datasets +and to identify common and sample-specific SVs (like the usual sample/reference design). +Overlaps between coordinates of clusters and types of SVs are used as parameters of comparison. + +Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/SVDetect_import.sh Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,15 @@ +#!/bin/bash + + +while getopts "i:o:" optionName; do +case "$optionName" in + +i) INPUT="$OPTARG";; +o) OUTPUT="$OPTARG";; + +esac +done + +rm $OUTPUT + +ln -s $INPUT $OUTPUT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/SVDetect_import.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,85 @@ +<tool id="svdetect_import" name="Import data"> + <description>BAM, chromosome info or sv files</description> + <command interpreter="bash">SVDetect_import.sh -i $file_path + #if str($type.file_type)=="bam" + -o $outbamfile + #elif str($type.file_type)=="len" + -o $outlenfile + #elif str($type.file_type)=="sv" + -o $outsvfile + #end if + </command> + <inputs> + <param name="file_name" type="text" value="file1" label="File Name"/> + <conditional name="type"> + <param name="file_type" type="select" label="Select the file type to import" help="BAM file (BAM) or text file (SAM, chromosome list or a SV tabulated text file)"> + <option value="bam">BAM file (.bam)</option> + <option value="len">Chromosome info file (.len)</option> + <option value="sv">SVDetect output file (.sv)</option> + </param> + <when value="bam"> + <!-- do nothing here --> + </when> + <when value="len"> + <!-- do nothing here --> + </when> + <when value="sv"> + <!-- do nothing here --> + </when> + </conditional> + <param name="file_path" type="text" size="150" label="Path to file"/> + </inputs> + <outputs> + <data format="bam" name="outbamfile" label="${file_name}.bam"> + <filter>type['file_type']=="bam"</filter> + </data> + <data format="len" name="outlenfile" label="${file_name}.len"> + <filter>type['file_type']=="len"</filter> + </data> + <data format="sv" name="outsvfile" label="${file_name}.sv"> + <filter>type['file_type']=="sv"</filter> + </data> + </outputs> + <help> +**What it does** + +This tool allows you to import quickly a BAM file, a chromosome info file or a SVDetect output file from you computer as inputs for SVDetect. + + +**Example of chromosome file** + +Input len file:: + + 1 chr1 247249719 + 2 chr2 242951149 + 3 chr3 199501827 + 4 chr4 191273063 + 5 chr5 180857866 + 6 chr6 170899992 + 7 chr7 158821424 + 8 chr8 146274826 + 9 chr9 140273252 + 10 chr10 135374737 + 11 chr11 134452384 + 12 chr12 132349534 + 13 chr13 114142980 + 14 chr14 106368585 + 15 chr15 100338915 + 16 chr16 88827254 + 17 chr17 78774742 + 18 chr18 76117153 + 19 chr19 63811651 + 20 chr20 62435964 + 21 chr21 46944323 + 22 chr22 49691432 + 23 chrX 154913754 + 24 chrY 57772954 + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/SVDetect_run_parallel.pl Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,3537 @@ +#!/usr/bin/perl -w + +=pod + +=head1 NAME + +SVDetect - Program designed to the detection of structural variations +from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads + +Version: 0.8b for Galaxy + +=head1 SYNOPSIS + +SVDetect <command> -conf <configuration_file> [-help] [-man] + + Command: + + linking detection and isolation of links + filtering filtering of links according different parameters + links2circos links conversion to circos format + links2bed paired-ends of links converted to bed format (UCSC) + links2SV formatted output to show most significant SVs + cnv calculate copy-number profiles + ratio2circos ratio conversion to circos density format + ratio2bedgraph ratio conversion to bedGraph density format (UCSC) + +=head1 DESCRIPTION + +This is a command-line interface to SVDetect. + + +=head1 AUTHORS + +Bruno Zeitouni E<lt>bruno.zeitouni@curie.frE<gt>, +Valentina Boeva E<lt>valentina.boeva@curie.frE<gt> + +=cut + +# ------------------------------------------------------------------- + +use strict; +use warnings; + +use Pod::Usage; +use Getopt::Long; + +use Config::General; +use Tie::IxHash; +use FileHandle; +use Parallel::ForkManager; + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#PARSE THE COMMAND LINE +my %OPT; +GetOptions(\%OPT, + 'conf=s', + 'out1=s', #GALAXY + 'out2=s', #GALAXY + 'out3=s', #GALAXY + 'out4=s', #GALAXY + 'out5=s', #GALAXY + 'l=s', #GALAXY + 'N=s',#GALAXY + 'help',#GALAXY + 'man' + ); + +pod2usage() if $OPT{help}; +pod2usage(-verbose=>2) if $OPT{man}; +pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); + +pod2usage() if(@ARGV<1); + +tie (my %func, 'Tie::IxHash',linking=>\&createlinks, + filtering=>\&filterlinks, + links2circos=>\&links2circos, + links2bed=>\&links2bed, + links2compare=>\&links2compare, + links2SV=>\&links2SV, + cnv=>\&cnv, + ratio2circos=>\&ratio2circos, + ratio2bedgraph=>\&ratio2bedgraph); + +foreach my $command (@ARGV){ + pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); +} +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#READ THE CONFIGURATION FILE +my $conf=Config::General->new( -ConfigFile => $OPT{conf}, + -Tie => "Tie::IxHash", + -AllowMultiOptions => 1, + -LowerCaseNames => 1, + -AutoTrue => 1); +my %CONF= $conf->getall; +validateconfiguration(\%CONF); #validation of the configuration parameters + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY + +my $pt_log_file=$OPT{l}; #GALAXY +my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY +my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY +my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY +my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY +my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY + +$CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY +$CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY + +my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#COMMAND EXECUTION +foreach my $command (@ARGV){ + &{$func{$command}}(); +} +print LOG "-- end\n";#GALAXY + +close LOG;#GALAXY +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY +exit(0); +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#FUNCTIONS +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 1: Detection of links from mate-pairs data +sub createlinks{ + + my %CHR; #main hash table 1: fragments, links + my %CHRID; + my @MATEFILES; + + my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; + my @path=split(/\//,$output_prefix); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; + my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; + + shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $CONF{general}{cmap_file}); + + if($CONF{detection}{split_mate_file}){ + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, + $CONF{general}{sv_type}, + $CONF{general}{mates_file}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + }else{ + + @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES); + print LOG "# Splitted mate files already created.\n"; + } + + + #Parallelization of the linking per chromosome for intra + interchrs + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $matefile (@MATEFILES){ + + my $pid = $pm->start and next; + getlinks(\%CHR, \%CHRID, $matefile); + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; + chomp(@LINKFILES); + catFiles( \@LINKFILES => "$output_prefix.links" ); + + system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY + print LOG "# Linking end procedure : output created: $output_prefix.links\n"; + #unlink(@LINKFILES); + #unlink(@MATEFILES); + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getlinks { + + my ($chr,$chrID,$tmp_mates_prefix)=@_; + + my $tmp_links_prefix=$tmp_mates_prefix; + $tmp_links_prefix=~s/\/mates\//\/links\//; + + my %PAIR; #main hash table 2: pairs + + linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs + $CONF{general}{read_lengths}, + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $tmp_mates_prefix, + $CONF{general}{input_format}, + $CONF{general}{sv_type}, + "$tmp_links_prefix.links.mapped" + ); + + getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons + "$tmp_links_prefix.links.unique"); + + defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links + $CONF{general}{input_format}, + $CONF{general}{sv_type}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.links.unique", + "$tmp_links_prefix.links.unique_defined"); + + sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates + "$tmp_links_prefix.links.sorted"); + + removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links + "$tmp_links_prefix.links",1); #file output + + + undef %PAIR; + + unlink("$tmp_links_prefix.links.mapped", + "$tmp_links_prefix.links.unique", + "$tmp_links_prefix.links.unique_defined", + "$tmp_links_prefix.links.sorted"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub splitMateFile{ + + my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; + + print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; + + my %filesHandle; + + #fichier matefile inter + if($sv_type=~/^(all|inter)$/){ + my $newFileName="$output_prefix.interchrs"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{inter}=$fh; + } + + #fichiers matefiles intra + if($sv_type=~/^(all|intra)$/){ + foreach my $k (1..$chr->{nb_chrs}){ + my $newFileName=$output_prefix.".".$chr->{$k}->{name}; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{$k}=$fh; + } + } + + if ($mates_file =~ /.gz$/) { + open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat + }elsif($mates_file =~ /.bam$/){ + open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; + } + + + while(<MATES>){ + + my @t=split; + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ + my $fh2print=$filesHandle{inter}; + print $fh2print join("\t",@t)."\n"; + } + + if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ + my $fh2print=$filesHandle{$chr_read1}; + print $fh2print join("\t",@t)."\n"; + + } + } + + foreach my $name (keys %filesHandle){ + my $fh=$filesHandle{$name}; + $fh->close; + } + + print LOG "# Splitted mate files of \"$mates_file\" created.\n"; +} + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub filterlinks{ + + my %CHR; + my %CHRID; + my @LINKFILES; + my @FLINKFILES; + + my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; + my @path=split(/\//,$output_prefix); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; + + createChrHashTables(\%CHR,\%CHRID, + $CONF{general}{cmap_file}); + + if($CONF{filtering}{split_link_file}){ + + splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, + $tmp_links_prefix, + $CONF{general}{sv_type}, + "$output_prefix.links", + ); + }else{ + + @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; + chomp(@LINKFILES); + print LOG "# Splitted link files already created.\n"; + } + + #Parallelization of the filtering per chromosome for intra + interchrs + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $linkfile (@LINKFILES){ + + my $pid = $pm->start and next; + getFilteredlinks(\%CHR, \%CHRID, $linkfile); + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; + chomp(@FLINKFILES); + catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); + + system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY + print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub splitLinkFile{ + + my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; + + print LOG "# Splitting the link file for parallel processing...\n"; + + my %filesHandle; + + #fichier matefile inter + if($sv_type=~/^(all|inter)$/){ + my $newFileName="$input_prefix.interchrs.links"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{inter}=$fh; + } + + #fichiers matefiles intra + if($sv_type=~/^(all|intra)$/){ + foreach my $k (1..$chr->{nb_chrs}){ + my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{$k}=$fh; + } + } + + open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; + while(<LINKS>){ + + my @t=split; + my ($chr_read1,$chr_read2)=($t[0],$t[3]); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ + my $fh2print=$filesHandle{inter}; + print $fh2print join("\t",@t)."\n"; + } + + if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ + my $fh2print=$filesHandle{$chr_read1}; + print $fh2print join("\t",@t)."\n"; + + } + } + + foreach my $name (keys %filesHandle){ + my $fh=$filesHandle{$name}; + $fh->close; + } + + print LOG "# Splitted link files created.\n"; +} + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 2: Filtering processing +sub getFilteredlinks { + + my ($chr,$chrID,$tmp_links_prefix)=@_; + my %PAIR; + + strandFiltering($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, #filtering of links + $CONF{filtering}{strand_filtering}, + $CONF{filtering}{chromosomes}, + $CONF{general}{input_format}, + $CONF{general}{cmap_file}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + $tmp_links_prefix, + "$tmp_links_prefix.filtered", + ); + + if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering + + my @tmpfiles; + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); + + getUniqueLinks("$tmp_links_prefix.filtered_unique", + "$tmp_links_prefix.filtered"); + + push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); + + if($CONF{filtering}{order_filtering}){ #filtering using the order + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); + + orderFiltering($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, + $CONF{filtering}{nb_pairs_order_threshold}, + $CONF{filtering}{mu_length}, + $CONF{filtering}{sigma_length}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.filtered_ordered", + "$tmp_links_prefix.filtered", + ); + + push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); + } + + if (($CONF{filtering}{insert_size_filtering})&& + ($CONF{general}{sv_type} ne 'inter')){ + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); + + addInsertionInfo($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, + $CONF{filtering}{order_filtering}, + $CONF{filtering}{indel_sigma_threshold}, + $CONF{filtering}{dup_sigma_threshold}, + $CONF{filtering}{singleton_sigma_threshold}, + $CONF{filtering}{mu_length}, + $CONF{filtering}{sigma_length}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.filtered_withoutIndelSize", + "$tmp_links_prefix.filtered" + ); + + push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); + } + + sortLinks("$tmp_links_prefix.filtered", + "$tmp_links_prefix.filtered_sorted"); + + removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", + "$tmp_links_prefix.filtered_nodup", + ); + + postFiltering("$tmp_links_prefix.filtered_nodup", + "$tmp_links_prefix.filtered", + $CONF{filtering}{final_score_threshold}); + + push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); + + unlink(@tmpfiles); + + + } + undef %PAIR; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 3: Circos format conversion for links +sub links2circos{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".segdup.txt"; + + links2segdup($CONF{circos}{organism_id}, + $CONF{circos}{colorcode}, + $input_file, + $output_file); #circos file output + + system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 4: Bed format conversion for links +sub links2bed{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".bed.txt"; + + links2bedfile($CONF{general}{read_lengths}, + $CONF{bed}{colorcode}, + $input_file, + $output_file); #bed file output + + system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 6: Bed format conversion for links +sub links2SV{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".sv.txt"; + + + links2SVfile( $input_file, + $output_file); + + system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 7: copy number variations, coverage ratio calculation +sub cnv{ + + my %CHR; + my %CHRID; + my @MATEFILES; + my @MATEFILES_REF; + + my $output_prefix=$CONF{general}{mates_file}; + my $output_prefix_ref=$CONF{detection}{mates_file_ref}; + my @path=split(/\//,$output_prefix); + my @path_ref=split(/\//,$output_prefix_ref); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; + my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; + my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; + my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; + + shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $CONF{general}{cmap_file}); + + if($CONF{detection}{split_mate_file}){ + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, + "intra", + $CONF{general}{mates_file}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, + "intra", + $CONF{detection}{mates_file_ref}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + + + }else{ + + @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES); + @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES_REF); + print LOG "# Splitted sample and reference mate files already created.\n"; + } + + #Parallelization of the cnv per chromosome + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $file (0..$#MATEFILES){ + + my $pid = $pm->start and next; + + densityCalculation(\%CHR, \%CHRID, $file, + $CONF{general}{read_lengths}, + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + \@MATEFILES, + \@MATEFILES_REF, + $MATEFILES[$file].".density", + $CONF{general}{input_format}); + + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; + chomp(@DENSITYFILES); + catFiles( \@DENSITYFILES => "$output_prefix.density" ); + + print LOG "# cnv end procedure : output created: $output_prefix.density\n"; + + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 8: Circos format conversion for cnv ratios +sub ratio2circos{ + + my $input_file=$CONF{general}{mates_file}.".density"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".segdup.txt"; + + ratio2segdup($CONF{circos}{organism_id}, + $input_file, + $output_file); +} +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios +sub ratio2bedgraph{ + + my $input_file=$CONF{general}{mates_file}.".density"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".bedgraph.txt"; + + ratio2bedfile($input_file, + $output_file); #bed file output +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Creation of the fragment library +sub shearingChromosome{ + + print LOG "# Making the fragments library...\n"; + + my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters + + createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name + + foreach my $k (1..$chr->{nb_chrs}){ + + print LOG"-- $chr->{$k}->{name}\n"; + + my $frag=1; + for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ + + my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; + $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage + + if($end==($chr->{$k}->{length})-1){ + $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome + last; + } + $frag++; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Creation of chromosome hash tables from the cmap file +sub createChrHashTables{ + + my ($chr,$chrID,$cmap_file)=@_; + $chr->{nb_chrs}=0; + + open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; + while(<CMAP>){ + + if(/^\s+$/){ next;} + my ($k,$name,$length) = split; + $chr->{$k}->{name}=$name; + $chr->{$k}->{length}=$length; + $chrID->{$name}=$k; + $chr->{nb_chrs}++; + + } + close CMAP; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Read the mate file according the input format file (solid, eland or sam) +sub readMateFile{ + + my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; + my ($strand1,$strand2); + + if($file_type eq "solid"){ + + ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based + + }else{ + my ($tag_length1,$tag_length2); + ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based + if($file_type eq "eland"); + + if($file_type eq "sam"){ + + return 0 if ($$t[0]=~/^@/); #header sam filtered out + + ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); + + return 0 if (($$t[1]&0x0004) || ($$t[1]&0x0008)); + + $$chr2=$$chr1 if($$chr2 eq "="); + + $strand1 = (($$t[1]&0x0010))? 'R':'F'; + $strand2 = (($$t[1]&0x0020))? 'R':'F'; + + $$order1= (($$t[1]&0x0040))? '1':'2'; + $$order2= (($$t[1]&0x0080))? '1':'2'; + $tag_length1 = $tag_length->{$$order1}; + $tag_length2 = $tag_length->{$$order2}; + } + + $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts + $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); + } + return 1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Parsing of the mates files and creation of links between 2 chromosomal fragments +sub linking{ + + my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; + my %link; + + my $record=0; + my $nb_links=0; + my $warn=10000; + + my @sfile=split(/\./,$mates_file); + my $fchr=$sfile[$#sfile]; + + my $fh = new FileHandle; + + print LOG "# $fchr : Linking procedure...\n"; + print LOG "-- file=$mates_file\n". + "-- chromosome=$fchr\n". + "-- input format=$input_format\n". + "-- type=$sv_type\n". + "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". + "-- window size=$window_dist, step length=$step\n"; + + if ($mates_file =~ /.gz$/) { + $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat + }elsif($mates_file =~ /.bam$/){ + $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; + } + + + while(<$fh>){ + + my @t=split; #for each mate-pair + my $mate=$t[0]; + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); + + next if(exists $$pair{$mate}); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if($sv_type ne "all"){ + if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || + ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ + }else{ + next; + } + } + + $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) + + $record++; + + my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read + + recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); + recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); + + for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation + + if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ + + if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ + + for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ + + if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { + + if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ + + makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link + } + + }else{ + + $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); + } + } + } + + }else{ + + $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); + } + } + + if($record>=$warn){ + print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; + $warn+=10000; + } + } + $fh->close; + + if(!$nb_links){ + print LOG "-- $fchr : No mate-pairs !\n". + "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n"; + } + + print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; + + print LOG "-- $fchr : writing...\n"; + + $fh = new FileHandle; + + $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; + + foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output + + foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ + + foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ + + foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ + + my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); + print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". + "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". + scalar @count."\t". #nb of read + $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list + } + } + } + } + + $fh->close; + + undef %link; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#remove exact links doublons according to the mate list +sub getUniqueLinks{ + + my ($links_file,$nrlinks_file)=@_; + my %links; + my %pt; + my $nb_links; + my $n=1; + + my $record=0; + my $warn=300000; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + + print LOG "# $fchr : Getting unique links...\n"; + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + + while(<$fh>){ + + my @t=split; + my $mates=$t[7]; + $record++; + + if(!exists $links{$mates}){ #Unique links selection + + $links{$mates}=[@t]; + $pt{$n}=$links{$mates}; + $n++; + + + }else{ #get the link coordinates from the mate-pairs list + + for my $i (1,2,4,5){ #get the shortest regions + + $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start + if($i==1 || $i==4); + $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end + if($i==2 || $i==5); + } + } + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; + $warn+=300000; + } + } + $fh->close; + + $nb_links=$n-1; + print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; + + $fh = new FileHandle; + $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; + print LOG "-- $fchr : writing...\n"; + for my $i (1..$nb_links){ + + print $fh join("\t",@{$pt{$i}})."\n"; #all links output + } + + $fh->close; + + undef %links; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#get the new coordinates of each link from the mate list +sub defineCoordsLinks{ + + my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; + + print LOG "# $fchr : Defining precise link coordinates...\n"; + + my $record=0; + my $warn=100000; + + my %coords; + my %strands; + my %order; + my %ends_order; + + while(<$fh>){ + + + my ($col1,$col2)=(1,2); #for an intrachromosomal link + my $diffchr=0; #difference between chr1 and chr2 + my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; + ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); + if ($chr1 != $chr2){ #for an interchromosomal link + $col1=$col2=0; #no distinction + $diffchr=1; + } + + my @pairs=split(",",$mates_list); + + $coords{$col1}{$chr1}->{start}=undef; + $coords{$col1}{$chr1}->{end}=undef; + $coords{$col2}{$chr2}->{start}=undef; + $coords{$col2}{$chr2}->{end}=undef; + $strands{$col1}{$chr1}=undef; + $strands{$col2}{$chr2}=undef; + $ends_order{$col1}{$chr1}=undef; + $ends_order{$col2}{$chr2}=undef; + + + $order{$col1}{$chr1}->{index}->{1}=undef; + $order{$col1}{$chr1}->{index}->{2}=undef; + $order{$col2}{$chr2}->{index}->{1}=undef; + $order{$col2}{$chr2}->{index}->{2}=undef; + $order{$col1}{$chr1}->{order}=undef; + $order{$col2}{$chr2}->{order}=undef; + + $record++; + + for my $p (0..$#pairs){ #for each pair + + my ($coord_start_read1,$coord_end_read1); + my ($coord_start_read2,$coord_end_read2); + my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); + my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); + + if(!$diffchr){ #for a intrachromosomal link + if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column + ($col1,$col2)=(2,1); + }else{ + ($col1,$col2)=(1,2); + } + } + + push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads + push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); + push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); + push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); + push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); + push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); + push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); + push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); + } + + ($col1,$col2)=(1,2) if(!$diffchr); + + my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region + my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); + my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); + my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); + + @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; + @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; + + foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) + foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ + + if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ + ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; + ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; + last; + } + } + } + + foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID + foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ + + if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ + ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; + ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; + last; + } + } + } + + @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads + @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); + @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); + @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); + @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); + @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); + @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); + + + my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output + $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, + scalar @pairs, + join(",",@pairs), + join(",",@{$strands{$col1}{$chr1}}), + join(",",@{$strands{$col2}{$chr2}}), + join(",",@{$ends_order{$col1}{$chr1}}), + join(",",@{$ends_order{$col2}{$chr2}}), + join(",",@{$order{$col1}{$chr1}->{order}}), + join(",",@{$order{$col2}{$chr2}->{order}}), + join(",",@{$coords{$col1}{$chr1}->{start}}), + join(",",@{$coords{$col2}{$chr2}->{start}})); + + print $fh2 join("\t",@link)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed\n"; + $warn+=100000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links processed\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Sort links according the concerned chromosomes and their coordinates +sub sortLinks{ + + my ($links_file,$sortedlinks_file,$unique)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + print LOG "# $fchr : Sorting links...\n"; + + my $pipe=($unique)? "| sort -u":""; + system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#removal of fully overlapped links +sub removeFullyOverlappedLinks{ + + my ($links_file,$nrlinks_file,$warn_out)=@_; + + my %pt; + my $n=1; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + while(<$fh>){ + + my @t=split("\t",$_); + $pt{$n}=[@t]; + $n++; + } + $fh->close; + + my $nb_links=$n-1; + my $nb=$nb_links; + + my %pt2; + my $nb2=1; + my $record=0; + my $warn=10000; + + print LOG "# $fchr : Removing fully overlapped links...\n"; + + LINK: + + for my $i (1..$nb){ + + my @link=(); + my @next_link=(); + my $ind1=$i; + + $record++; + if($record>=$warn){ + print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; + $warn+=10000; + } + + if(exists $pt{$ind1}){ + @link=@{$pt{$ind1}}; #link1 + }else{ + next LINK; + } + + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 + my @mates=deleteBadOrderSensePairs(split(",",$link[7])); + + my $ind2=$ind1+1; + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link + + if($ind2<=$nb){ + + @next_link=@{$pt{$ind2}}; #link2 + my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 + my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + + while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter + + if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 + + $ind2++; + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); + + if($ind2>$nb){ #if no more link in the file -> save link1 + + $pt2{$nb2}=\@link; + $nb2++; + next LINK; + } + + @next_link=@{$pt{$ind2}}; + ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); + @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + next; + } + + my %mates=map{$_ =>1} @mates; #get the equal number of mates + my @same_mates = grep( $mates{$_}, @next_mates ); + my $nb_mates= scalar @same_mates; + + if($nb_mates == scalar @mates){ + + delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 + next LINK; #go to link2, link2 becomes link1 + + }else{ + delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 + $ind2++; #we continue by checking the next link2 + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); + + if($ind2>$nb){ #if no more link in the file -> save link1 + + $pt2{$nb2}=\@link; + $nb2++; + next LINK; + } + + @next_link=@{$pt{$ind2}}; #get info of link2 + ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); + @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + + } + } + } + $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 + $nb2++; + } + + print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; + + #OUTPUT + + $fh = new FileHandle; + $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; + print LOG "-- $fchr : writing...\n"; + for my $i (1..$nb2-1){ + + print $fh join("\t",@{$pt2{$i}}); #all links output + } + + close $fh; + + print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); + + undef %pt; + undef %pt2; +} +#------------------------------------------------------------------------------# +sub postFiltering { + + my ($links_file,$pflinks_file, $finalScore_thres)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my ($nb,$nb2)=(0,0); + + print LOG "# $fchr : Post-filtering links...\n"; + print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; + + + while(<$fh>){ + + my @t=split("\t",$_); + my $score=$t[$#t-1]; + + if($score >= $finalScore_thres){ + print $fh2 join("\t", @t); + $nb2++; + } + $nb++; + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; + print LOG "-- $fchr : output created: $pflinks_file\n"; +} + + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Filtering of the links +sub strandFiltering{ + + my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, + $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-1]; + + + my %chrs; + my %chrs1; + my %chrs2; + my $nb_chrs; + my $exclude; + + if($chromosomes){ + my @chrs=split(",",$chromosomes); + $nb_chrs=scalar @chrs; + $exclude=($chrs[0]=~/^\-/)? 1:0; + for my $chrName (@chrs){ + $chrName=~s/^(\-)//; + my $col=($chrName=~s/_(1|2)$//); + + if(!$col){ + $chrs{$chrID->{$chrName}}=undef + }else{ + $chrs1{$chrID->{$chrName}}=undef if($1==1); + $chrs2{$chrID->{$chrName}}=undef if($1==2); + } + } + } + + my $record=0; + my $nb_links=0; + my $warn=10000; + + my $sens_ratio_threshold=0.6; + + print LOG "\# Filtering procedure...\n"; + print LOG "\# Number of pairs and strand filtering...\n"; + print LOG "-- file=$links_file\n"; + print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). + ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; + + + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + my @t=split; #for each link + my $is_good=1; + $record++; + + + if($chromosomes){ + + my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); + + if(!$exclude){ + $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; + $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); + $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); + }else{ + $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; + $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); + } + } + + $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs + if($is_good && $strand_filtering){ #if filtering according the strand sense + + my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense + my @strands1=split(/,/,$t[8]); + my @strands2=split(/,/,$t[9]); + + my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); + + my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR + 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF + + my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", + substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", + inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); + + if($t[6] == 1){ + + push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); + + }else{ + + tie (my %class,'Tie::IxHash'); + my $split; + + foreach my $i (0..$#mates){ + $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group + } + + my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; + my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; + my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); + + if($sens_ratio < $sens_ratio_threshold){ + %class=(1=>'FF', 2=>'FR'); + $split=1; + }else{ + $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class + $split=0; + } + + $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); + } + } + + if($is_good){ #PRINT + + my $nb=scalar @t; + if($nb > 20){ + my @t2=splice(@t,0,20); + print $fh2 join("\t",@t2)."\n"; + $nb_links++; + } + $nb_links++; + print $fh2 join("\t",@t)."\n"; + } + + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; + $warn+=10000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getConsistentSenseLinks{ + + my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; + + my $npairs=scalar @$mates; + + my @ends_order1 = split (/,/,$$t[10]); + my @ends_order2 = split (/,/,$$t[11]); + my @order1 = split (/,/,$$t[12]); + my @order2 = split (/,/,$$t[13]); + my @positions1 = split (/,/,$$t[14]); + my @positions2 = split (/,/,$$t[15]); + + my @newlink; + + foreach my $ind (keys %{$class} ){ + + tie (my %flink,'Tie::IxHash'); + my @orders2remove=(); + + foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group + + if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ + push(@{$flink{mates}},$$mates[$i]); + push(@{$flink{strands1}},$$strands1[$i]); + push(@{$flink{strands2}},$$strands2[$i]); + push(@{$flink{ends_order1}},$ends_order1[$i]); + push(@{$flink{ends_order2}},$ends_order2[$i]); + push(@{$flink{positions1}},$positions1[$i]); + push(@{$flink{positions2}},$positions2[$i]); + + }else{ + push(@orders2remove,$order1[$i]); + } + } + + @{$flink{order1}}=(); + @{$flink{order2}}=(); + if(scalar @orders2remove > 0){ + getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) + }else{ + @{$flink{order1}}=@order1; + @{$flink{order2}}=@order2; + } + + my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); + my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); + + my $fnpairs=scalar @{$flink{mates}}; + my $strand_filtering_ratio=$fnpairs."/".$npairs; + my $real_ratio=$fnpairs/$npairs; + + if($fnpairs>=$thres){ #filtering according the number of pairs + + push(@newlink, + $$t[0], + min(min(@{$flink{positions1}}),min(@ends1)), + max(max(@{$flink{positions1}}),max(@ends1)), + $$t[3], + min(min(@{$flink{positions2}}),min(@ends2)), + max(max(@{$flink{positions2}}),max(@ends2)), + $fnpairs, + join(",",@{$flink{mates}}), + join(",",@{$flink{strands1}}), + join(",",@{$flink{strands2}}), + join(",",@{$flink{ends_order1}}), + join(",",@{$flink{ends_order2}}), + join(",",@{$flink{order1}}), + join(",",@{$flink{order2}}), + join(",",@{$flink{positions1}}), + join(",",@{$flink{positions2}}), + $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, + $strand_filtering_ratio, + $real_ratio, + $npairs + ); + } + } + + if (grep {defined($_)} @newlink) { + @$t=@newlink; + return 1 + } + return 0; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getNewOrders{ + + my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; + my $j=1; + my $k=1; + for my $i (0..$#{$tab2}){ + my $c=0; + for my $j (0..$#{$list}){ + $c++ if(${$list}[$j] < ${$tab2}[$i]); + if(${$list}[$j] == ${$tab2}[$i]){ + $c=-1; last; + } + } + if($c!=-1){ + push(@{$newtab2}, ${$tab2}[$i]-$c); + push(@{$newtab1}, $k); + $k++; + } + } +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Filtering of the links using their order +sub orderFiltering { + + my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; + + my $record=0; + my $warn=10000; + my $nb_links=0; + + my $quant05 = 1.644854; + my $quant001 = 3.090232; + my $alphaDist = $quant05 * 2 * $sigma; + my $maxFragmentLength = &floor($quant001 * $sigma + $mu); + + print LOG "\# Filtering by order...\n"; + print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; + print LOG "-- distance between comparable pairs was set to $alphaDist\n"; + print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; + + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + $record++; + my @t = split; + my ($chr1,$chr2,$mates_list)=@t[0,3,7]; + my @pairs=split(",",$mates_list); + ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); + my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; + my $numberOfPairs = $t[6]; + my @strand1 = split (/,/,$t[8]); + my @strand2 = split (/,/,$t[9]); + my @ends_order1 = split (/,/,$t[10]); + my @ends_order2 = split (/,/,$t[11]); + my @order1 = split (/,/,$t[12]); + my @order2 = split (/,/,$t[13]); + my @positions1 = split (/,/,$t[14]); + my @positions2 = split (/,/,$t[15]); + my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); + my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); + my $clusterCoordinates_chr1; + my $clusterCoordinates_chr2; + my $reads_left = 0; + + my $ifRenv = $t[16]; + my $strand_ratio_filtering=$t[17]; + + #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if <F-R> orientation is correct + my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); + #find pairs type F-RRRR or FFFF-R in the case if <R-F> orientation is correct + #These pairs are annotated as BED pairs forever! They won't be recycled! + my $table; + for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs + my $nonAdeq = 0; + for my $j (0..$i-1) { + if (exists($table->{$j}->{$i})) { + $nonAdeq++; + $table->{$i}->{$j} = 1; + } + } + for my $j ($i+1..$numberOfPairs-1) { + if ($positions1[$j]-$positions1[$i]>$alphaDist) { + if (&reversed ($i,$j,$ifRenv,\@positions2)) { + $nonAdeq++; + $table->{$i}->{$j} = 1; + } + } + } + $table->{$i}->{nonAdeq} = $nonAdeq; + } + + for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of <R-F> orientation + &remove($bad,$table); + } + + my @falseReads; + #RRRR-F -> RRRR or R-FFFF -> FFFF + @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); + #these pairs will be recycled later as $secondTable + for my $bad (@falseReads) { + &remove($bad,$table); + } + + my $bad = &check($table); + while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER + # push (@falseReads, $bad); remove completely!!! + &remove($bad,$table); + $bad = &check($table); + } + + $reads_left = scalar keys %{$table}; + my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); + my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); + my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); + my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); + + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + + my $ifBalanced = 'UNBAL'; + my $secondTable; + my $clusterCoordinates; + my ($break_pont_chr1,$break_pont_chr2); + + my $signatureType=""; + + my $maxCoord1 =$chr->{$chr1}->{length}; + my $maxCoord2 =$chr->{$chr2}->{length}; + + if (scalar @falseReads) { + @falseReads = sort @falseReads; + #now delete FRFR choosing the majority + my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF + @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled + my %hashTmp; + for my $count1 (0..scalar(@falseReads)-1) { + my $i = $falseReads[$count1]; + $hashTmp{$i} = 1; + for my $bad (@newfalseReads) { + if ($bad == $i) { + delete $hashTmp{$i}; + next; + } + } + } + @falseReads = sort keys %hashTmp; #what is left + for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected + my $nonAdeq = 0; + my $i = $falseReads[$count1]; + + for my $count2 (0..$count1-1) { + my $j = $falseReads[$count2]; + if (exists($secondTable->{$j}->{$i})) { + $nonAdeq++; + $secondTable->{$i}->{$j} = 1; + } + } + for my $count2 ($count1+1..scalar(@falseReads)-1) { + my $j = $falseReads[$count2]; + if ($positions1[$j]-$positions1[$i]>$alphaDist) { + if (&reversed ($i,$j,$ifRenv,\@positions2)) { + $nonAdeq++; + $secondTable->{$i}->{$j} = 1; + } + } + } + $secondTable->{$i}->{nonAdeq} = $nonAdeq; + } + + my @falseReads2; + my $bad = &check($secondTable); + while ($bad ne "OK") { #clear the table to reject non adequate pairs + push (@falseReads2, $bad); + &remove($bad,$secondTable); + $bad = &check($secondTable); + } + if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { + my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); + + $ifBalanced = 'BAL'; + + if ($ifBalanced eq 'BAL') { + + if (scalar keys %{$table} < $nb_pairs_order_threshold) { + $ifBalanced = 'UNBAL'; #kill cluster 1! + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + $reads_left = scalar keys %{$table}; + $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; + $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; + $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; + $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + + } else { + + $reads_left += scalar keys %{$secondTable}; + next if ($reads_left < $nb_pairs_threshold); + + if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + + ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); + ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); + ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); + ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); + + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; + }else { + $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; + $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; + } + $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); + $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); + $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); + $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); + #to calculate breakpoints one need to take into account read orientation in claster.. + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + + my @index1 = keys %{$table}; + my @index2 = keys %{$secondTable}; + + my (@generalStrand1,@generalStrand2) = 0; + + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); + @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); + } else { + @generalStrand1 = @strand1; + @generalStrand2 = @strand2; # TODO check if it is correct + } + if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) + $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; + + if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; + $signatureType = "TRANSLOC"; + } else { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INS_FRAGMT"; + } + + } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INV_INS_FRAGMT"; + } else { + $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; + $signatureType = "INV_TRANSLOC"; + } + } else { + #should not occur + print STDERR "\nError in orderFiltering\n\n"; + } + } + + elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) + $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; + $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; + if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; + $signatureType = "INV_INS_FRAGMT"; + } else { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INV_COAMPLICON"; + } + + } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "COAMPLICON"; + } else { + $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; + $signatureType = "INS_FRAGMT"; + } + } else { + #should not occur + $signatureType = "UNDEFINED"; + } + } + else { # (F,F) or (R,R) something strange. We will discard the smallest cluster + $ifBalanced = 'UNBAL'; + if (scalar keys %{$secondTable} > scalar keys %{$table}) { + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + + $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; + $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; + $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; + $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + } + $reads_left = scalar keys %{$table}; + } + if ($ifBalanced eq 'BAL') { + $ifRenv = $signatureType; + } + } + } + } + } + if ($ifBalanced ne 'BAL') { + #define possible break point + $coord_start_chr1 = $coord_start_chr1_cluster1; + $coord_end_chr1 = $coord_end_chr1_cluster1; + $coord_start_chr2 = $coord_start_chr2_cluster1; + $coord_end_chr2 = $coord_end_chr2_cluster1; + + my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; + my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my @index = keys %{$table}; + unless ($diff_sense_ends) { + my $firstEndOrder1 = $ends_order1[$index[0]]; + my $firstEndOrder2 = $ends_order2[$index[0]]; + $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; + $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; + } else { + $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; + $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; + } + + if ($chr1 ne $chr2){ + $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); + $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); + } + } + + if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { + next; #discard the link + } + if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { + next; #discard the link + } + my $ratioTxt = "$reads_left/".(scalar @pairs); + my ($n1,$nTot) = split ("/",$strand_ratio_filtering); + my $ratioReal = $reads_left/$nTot; + + if ($coord_start_chr1<=0) { + $coord_start_chr1=1; + } + if ($coord_start_chr2<=0) { + $coord_start_chr2=1; + } + #create output + my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output + $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, + $reads_left, + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), + &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), + &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), + $ifRenv, + $strand_ratio_filtering, + $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, + $ratioReal, $nTot); + + $nb_links++; + print $fh2 join("\t",@link)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; + $warn+=10000; + } + + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets information about ends positions given start, direction and order +sub getEnds { + my ($ends,$starts,$strand,$end_order,$tag_length) = @_; + for my $i (0..scalar(@{$starts})-1) { + $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); + } +} +sub getEnd { + my ($start,$strand, $end_order,$tag_length) = @_; + return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets starts and ends Coords when start=leftmost given positions, directions and orders +sub getCoordswithLeftMost { + + my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; + + for my $i (0..scalar(@{$positions})-1) { + + if($strand->[$i] eq 'F'){ + $starts->[$i]=$positions->[$i]; + $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; + }else{ + $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; + $ends->[$i]=$positions->[$i]; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering + + my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; + + my $record=0; + my $nb_links=0; + my $warn=10000; + + print LOG "\# Filtering out normal pairs using insert size...\n"; + print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; + print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". + ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; + print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". + ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; + print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + $record++; + my @t = split; + my ($chr1,$chr2,$mates_list)=@t[0,3,7]; + + if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), + $nb_links++; + + $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); + $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); + + $t[16].= "\t"; + $t[19].= "\t"; + + print $fh2 join("\t",@t)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; + $warn+=10000; + } + next; + } + + my $ifRenv = $t[16]; + my $ifBalanced = "UNBAL"; + $ifBalanced = $t[18] if ($order_filtering); + + my $numberOfPairs = $t[6]; + my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); + my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); + + if ($ifBalanced eq "BAL") { + + if ($ifRenv eq "INV_TRANSLOC") { + $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment + } + if ($ifRenv eq "NORMAL_SENSE") { + $ifRenv = "TRANSLOC"; + } + if ($ifRenv eq "REVERSE_SENSE") { + $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment + } + $t[19].= "\t"; + + my $meanDistance = 0; + + for my $i (0..$numberOfPairs-1) { + $meanDistance += $positions2[$i]-$positions1[$i]; + } + $meanDistance /= $numberOfPairs; + + $t[16] = $ifRenv."\t".$meanDistance; + #dont touch the annotation. It should be already OK. + + } else { + #only for unbalanced + + my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); + + my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. + (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; + my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. + (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; + + my $indel_type = $ifRenv; + + my $meanDistance = "N/A"; + + ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster + \@positions1, #assign N/A to $indel_type if unknown + \@positions2, + $ifRenv, + $ifoverlap, + $indel_sigma_threshold, + $dup_sigma_threshold, + $singleton_sigma_threshold, + $mu, + $sigma, + $ifBalanced, + $ends_sense_class, + $ends_order_class, + $mate_sense, + $diff_sense_ends, + ); + + #filtering of pairs with distance inconsistant with the SV + if ($ifRenv ne "REVERSE_SENSE") { + my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; + my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; + $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, + $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); + next if ($t[6] < $nb_pairs_threshold); + }else{ + $t[19].= "\t"; + } + $t[16] = $indel_type."\t".$meanDistance; + } + + $nb_links++; + + print $fh2 join("\t",@t)."\n"; + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; + $warn+=10000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub checkIndel { + + my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, + $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; + + my $meanDistance = 0; + + for my $i (0..$numberOfPairs-1) { + $meanDistance += $positions2->[$i]-$positions1->[$i]; + } + $meanDistance /= $numberOfPairs; + + return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); + + return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); + + if($diff_sense_ends){ + return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; + return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); + }else{ + return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && + ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; + } + + return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); + + return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; + + return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); + return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); + + return ($meanDistance, "UNDEFINED"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) +sub recalc_t_usingInsertSizeInfo { + my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; + + my @badPairs; + + my @positions1 = getAllEntries($t->[14]); + my @positions2 = getAllEntries($t->[15]); + + if ($meanDistance < $mu) { + for my $i (0..scalar(@positions1)-1) { + if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { + push(@badPairs,$i); + } + } + } else { + for my $i (0..scalar(@positions1)-1) { + if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { + push(@badPairs,$i); + } + } + } + + if (scalar (@badPairs)>0) { + #print join("\t",@badPairs).": ".join("\t",@t)."\n"; + #remove these inconsistant links + $t->[6] -= scalar(@badPairs); #numberOfPairs + return if ($t->[6] < $nb_pairs_threshold); + + $t->[7] = mark_values(\@badPairs, $t->[7]); + $t->[8] = mark_values(\@badPairs, $t->[8]); + $t->[9] = mark_values(\@badPairs, $t->[9]); + $t->[10] = mark_values(\@badPairs, $t->[10]); + $t->[11] = mark_values(\@badPairs, $t->[11]); + + $t->[12] = mark_indexes(\@badPairs, $t->[12]); + $t->[13] = mark_indexes(\@badPairs, $t->[13]); + + $t->[14] = mark_values(\@badPairs, $t->[14]); + $t->[15] = mark_values(\@badPairs, $t->[15]); + $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio + $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); + ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); + ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); + + #recalc breakpoints: + my $quant001 = 3.090232; + my $maxFragmentLength = &floor($quant001 * $sigma + $mu); + $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); + $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); + #recalc total ratio + $t->[22] = $t->[6] / $t->[23] if ($order_filtering); + $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); + + @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); + @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); + + $meanDistance = 0; + + for my $i (0..scalar(@positions1)-1) { + $meanDistance += $positions2[$i]-$positions1[$i]; + } + $meanDistance /= scalar(@positions1); + + } else { + $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); + $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); + + } #nothing has been filtered + return $meanDistance; +} + +sub recalculate_ratio { + my ($left, $ratio) = @_; + my @elements = split (/\//,$ratio); + $elements[1]= $elements[0]; + $elements[0]=$left; + return $ratio."\t".join("/",@elements); +} + +sub recalc_breakpoints { + my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; + my $break_pont_chr; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + + my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); + + unless ($diff_sense_ends) { + $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; + } else { + $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; + } + return $break_pont_chr; +} +sub recalculate_boundaries { + my ($startString,$senseString,$endsOrderString,$tag_length) = @_; + my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); + my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); + my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); + my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); + my $coord_start_cluster = min(min(@positions),min(@ends)); + my $coord_end_cluster = max(max(@positions),max(@ends)); + return ($coord_start_cluster,$coord_end_cluster); +} + +sub remove_indexes { + my ($bads, $string) = @_; + my @elements = deleteBadOrderSensePairs(split (/,/,$string)); + for my $i (reverse sort %{$bads}) { + delete $elements[$i]; + } + return "(".join(",",@elements).")"; +} +##add @ to to elements +sub mark_values { + my ($bads, $string) = @_; + my @elements = getAllEntries($string); + for my $i (@{$bads}) { + $elements[$i] .= "@"; + } + return "(".join(",",@elements).")"; +} +##add @ to to indexes +sub mark_indexes { + my ($bads, $string) = @_; + my @elements = getAllEntries($string); + for my $i ((0..scalar(@elements)-1)) { + for my $j (@{$bads}) { + $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); + } + } + + return "(".join(",",@elements).")"; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub redraw { + + my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; + + my $out; + my @first_arr; + if ($ifBalanced eq 'BAL') { + my @second_arr; + my $lastPushed = 1; + if ($type == 1) { + for my $i (0 .. scalar(@{$arr})-1) { + if (exists ($table->{$i})) { + push(@first_arr,$arr->[$i]); + $lastPushed = 1; + }elsif (exists ($secondTable->{$i})) { + push(@second_arr,$arr->[$i]); + $lastPushed = 2; + } elsif ($lastPushed == 1) { + if (exists ($badInFRSense->{$i})) { + push(@first_arr,$arr->[$i]."\$"); + }else { + push(@first_arr,$arr->[$i]."*"); + } + } elsif ($lastPushed == 2) { + if (exists ($badInFRSense->{$i})) { + push(@second_arr,$arr->[$i]."\$"); + }else { + push(@second_arr,$arr->[$i]."*"); + } + } else {print "Error!";exit;} + } + } else { + for my $i (@{$arr}) { + if (exists ($table->{$i-1})) { + push(@first_arr,$i); + $lastPushed = 1; + }elsif (exists ($secondTable->{$i-1})) { + push(@second_arr,$i); + $lastPushed = 2; + } elsif ($lastPushed == 1) { + if (exists ($badInFRSense->{$i-1})) { + push(@first_arr,$i."\$"); + }else { + push(@first_arr,$i."*"); + } + } elsif ($lastPushed == 2) { + if (exists ($badInFRSense->{$i-1})) { + push(@second_arr,$i."\$"); + }else { + push(@second_arr,$i."*"); + } + } else {print "Error!";exit;} + } + } + $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; + } + else { + if ($type == 1) { + for my $i (0 .. scalar(@{$arr})-1) { + if (exists ($table->{$i})) { + push(@first_arr,$arr->[$i]); + } else { + if (exists ($badInFRSense->{$i})) { + push(@first_arr,$arr->[$i]."\$"); + }else { + push(@first_arr,$arr->[$i]."*"); + } + } + } + } else { + for my $i (@{$arr}) { + if (exists ($table->{$i-1})) { + push(@first_arr,$i); + } else { + if (exists ($badInFRSense->{$i-1})) { + push(@first_arr,$i."\$"); + }else { + push(@first_arr,$i."*"); + } + } + } + } + $out = '('.join(",",@first_arr).')'; + } + return $out; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub check { + + my $table = $_[0]; + my $bad = 'OK'; + my $max = 0; + for my $i (sort {$a<=>$b} keys %{$table}) { + unless ($table->{$i}->{nonAdeq} == 0) { + if ($max<$table->{$i}->{nonAdeq}) { + $max=$table->{$i}->{nonAdeq}; + $bad = $i; + } + } + } + return $bad; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub reversed { + + my ($i,$j,$ifRenv,$positions) = @_; + if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ + return 1; + } + return 0; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub remove { + + my ($bad,$table) = @_; + for my $i (sort {$a<=>$b} keys %{$table}) { + if ($bad == $i) { + delete($table->{$i});; + } else { + if (exists($table->{$i}->{$bad})) { + delete($table->{$i}->{$bad}); + $table->{$i}->{nonAdeq}--; + } + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs + + my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my (@standardArray); + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @standardArray = translateSolidToRF($strand,$ends_order); + } else { + @standardArray = @{$strand}; + } + + my $ifR = 0; + my @Rs; + + for my $i (@keysLeft) { + if ($standardArray[$i] eq $leftLetterOk) { + $ifR++; + push(@Rs,$i); + } + } + + + my $ifF = 0; + my @Fs; + + for my $i (@keysLeft) { + if ($standardArray[$i] eq $rightLetterOk) { + $ifF++; + push(@Fs,$i); + } + } + + if($ifR>=$ifF) { + return @Fs; + } + return @Rs; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa + + my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); + my $mate_sense = $_[6]; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my (@standardArray1,@standardArray2); + + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @standardArray1 = translateSolidToRF($strand1,$ends_order1); + my @arr = getOrderedStrands($strand2,$order2); + my @ends2 = getOrderedStrands($ends_order2,$order2); + @standardArray2 = translateSolidToRF(\@arr,\@ends2); + + } else { + @standardArray1 = @{$strand1}; + @standardArray2 = getOrderedStrands($strand2,$order2); + } + + #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF + + #for the first end: + + my @array = @standardArray1; + my %badInFRSense1; + for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense1{$i}=1; + $array[$i] = $rightLetterOk; + } + } + my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); + @array = @standardArray1; + my %badInFRSense0; + for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense0{$i-1}=1; + $array[$i-1] = $leftLetterOk; + + } + } + my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); + + #for the second end: + @array = @standardArray2; + + my %badInFRSense3; + for my $i (1..scalar(@array)-1){ + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense3{$order2->[$i]}=1; + $array[$i] = $rightLetterOk; + } + } + my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); + + @array = @standardArray2; + my %badInFRSense5; + for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense5{$i-1}=1; + $array[$i-1] = $leftLetterOk; + } + } + my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); + + if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { + return (1,%badInFRSense0); + } + + if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { + return (1,%badInFRSense1); + } + + if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { + return (2,%badInFRSense3); + } + + if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { + return (2,%badInFRSense5); + } + + #should not get here: + print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; + return (1,%badInFRSense1); +} + +sub getOrderedStrands { + my ($strand,$order) = ($_[0],$_[1]); + my @arr; + for my $i (0..scalar(@{$strand})-1) { + push(@arr,$strand->[$order->[$i]-1]); + } + return @arr; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub checkClusters { + + my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; + if ($ifRenv eq 'REVERSE_SENSE') { + if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { + return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; + } + return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; + } + #if NORM + if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { + return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; + } + return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub translateSolidToRF { + my ($strandArr,$ends_orderArr)=@_; + my @array; + for my $i (0..scalar(@{$strandArr})-1) { + if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { + push(@array,'F'); + } + if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { + push(@array,'R'); + } + if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { + push(@array,'R'); + } + if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { + push(@array,'F'); + } + } + return @array; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the circos format +sub links2segdup{ + + my($id,$color_code,$links_file,$segdup_file)=@_; + + print LOG "# Converting to the circos format...\n"; + + tie (my %hcolor,'Tie::IxHash'); #color-code hash table + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + } + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + my $index=1; + while(<LINKS>){ + + my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; + + my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links + + print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output + "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; + $index++; + } + + close LINKS; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bedPE format for BEDTools usage +sub links2bedPElinksfile{ + + my ($sample,$links_file,$bedpe_file)=@_; + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; + + my $nb_links=1; + + while(<LINKS>){ + + chomp; + my @t=split("\t",$_); + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); + my $type=($chr1 eq $chr2)? "INTRA":"INTER"; + $type.="_".$t[10]; + + $start1--; $start2--; + + print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". + "\t$sample"."_link$nb_links\t$type\t.\t.". + "\t".join("|",@t)."\n"; + + $nb_links++; + } + + close LINKS; + close BEDPE; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub bedPElinks2linksfile{ + + my ($bedpe_file,$links_file)=@_; + + open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; + open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; + + while(<BEDPE>){ + + chomp; + my $sample=(split("_",(split("\t",$_))[6]))[0]; + my @t1=(split("\t",$_))[0,1,2,3,4,5]; + my @t2=split(/\|/,(split("\t",$_))[10]); + push(@t2,$sample); + + print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; + + } + close BEDPE; + close LINKS; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bed format +sub links2bedfile{ + + my ($tag_length,$color_code,$links_file,$bed_file)=@_; + + print LOG "# Converting to the bed format...\n"; + + my $compare=1; + if($links_file!~/compared$/){ + $compare=0; + $tag_length->{none}->{1}=$tag_length->{1}; + $tag_length->{none}->{2}=$tag_length->{2}; + } + + #color-code hash table + tie (my %hcolor,'Tie::IxHash'); + my %color_order; + my $n=1; + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + $color_order{$col}=$n; + $n++; + } + + my %pair; + my %pt; + $n=1; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + + my %str=( "F"=>"+", "R"=>"-" ); + + while(<LINKS>){ + + my @t=split; + my $sample=($compare)? pop(@t):"none"; + + my $chr1=$t[0]; + my $chr2=$t[3]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $same_chr=($chr1 eq $chr2)? 1:0; + + my $count=$t[6]; + my $color=getColor($count,\%hcolor,"bed"); + + my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); + my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); + my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); + my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); + my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); + my @position1=deleteBadOrderSensePairs(split(",",$t[14])); + my @position2=deleteBadOrderSensePairs(split(",",$t[15])); + my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); + my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); + + + for my $p (0..$#pairs){ + + if (!exists $pair{$pairs[$p]}){ + + if($same_chr){ + + $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, + $start1[$p]-1, $end2[$p], $color, + 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; + $pt{$n}=$pair{$pairs[$p]}->{0}; + $n++; + + }else{ + + $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, + $start1[$p]-1, $end1[$p], $color, + 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{1}; + $n++; + + + $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, + $start2[$p]-1, $end2[$p], $color, + 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{2}; + $n++; + } + }else{ + + if($same_chr){ + ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); + }else{ + ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); + ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); + } + } + } + } + close LINKS; + + my $nb_pairs=$n-1; + + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". + "visibility=2 itemRgb=\"On\"\n"; + + for my $i (1..$nb_pairs){ + print BED join("\t",@{$pt{$i}})."\n"; + } + + close BED; + + print LOG "-- output created: $bed_file\n"; + + undef %pair; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub deleteBadOrderSensePairs{ + + my (@tab)=@_; + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v) if($v!~/[\$\*\@]$/); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getAllEntries{ + my (@tab)=split (/,/,$_[0]); + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v); + } + return @tab2; +}#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getAllEntriesWOspecialChar{ + my (@tab)=split (/,/,$_[0]); + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)\$\*\@]//g; + push(@tab2,$v); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub links2SVfile{ + + my($links_file,$sv_file)=@_; + + print LOG "# Converting to the sv output table...\n"; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist + chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering + final_score breakpoint1_start1-end1 breakpoint2_start2-end2); + + my $nb_links=0; + + while (<LINKS>){ + + my @t=split; + my @sv=(); + my $sv_type="-"; + my $strand_ratio="-"; + my $eq_ratio="-"; + my $eq_type="-"; + my $insert_ratio="-"; + my $link="-"; + my ($bk1, $bk2)=("-","-"); + my $score="-"; + + my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); + my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); + my $nb_pairs=$t[6]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; + + #if strand filtering + if (defined $t[16]){ + #if inter-chr link + $sv_type=$t[16]; + if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ + $strand_ratio=floor($1/$2*100)."%"; + $score=$t[18]; + } + if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ + #if intra-chr link with insert size filtering + $strand_ratio=floor($1/$2*100)."%"; + $link=floor($t[17]); + if($sv_type!~/^INV/){ + $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + $score=$t[20]; + }else{ + $score=$t[19]; + } + } + } + + if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ + + #if strand and order filtering only and/or interchr link + $eq_type=$t[18]; + $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + ($bk1, $bk2)=($t[20],$t[21]); + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + $score=$t[22]; + + }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ + + #if all three filtering + $link=floor($t[17]); + $eq_type=$t[19]; + $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); + + if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ + $insert_ratio=floor($1/$2*100)."%"; + ($bk1, $bk2)=($t[22],$t[23]); + $score=$t[24]; + + }else{ + ($bk1, $bk2)=($t[21],$t[22]); + $score=$t[23]; + } + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + + } + + + push(@sv, $chr_type, $sv_type,$eq_type); + push(@sv,"$chr1\t$start1-$end1"); + push(@sv, $link); + push(@sv,"$chr2\t$start2-$end2", + $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); + + + print SV join("\t",@sv)."\n"; + } + + close LINKS; + close SV; + + system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; + + open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; + my @links=<SV>; + close SV; + + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + print SV join("\t",@header)."\n"; + print SV @links; + close SV; + + unlink($sv_file.".sorted"); + + print LOG "-- output created: $sv_file\n"; + +} +#------------------------------------------------------------------------------# +sub densityCalculation{ + + my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; + + my @sfile=split(/\./,$$mates_file[$file]); + my $fchr=$sfile[$#sfile]; + + my $fh = new FileHandle; + + my %density; + my %density_ref; + my @ratio; + my ($cov,$cov_ref); + + #FREQUENCY CALCULATION PROCEDURE + print LOG "# $fchr : Frequency calculation procedure...\n"; + &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); + &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); + + #MAKING RATIO AND OUTPUT + print LOG "\# Ratio calculation procedure...\n"; + $density_file=~s/\/mates\//\/density\//; + $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; + + foreach my $k (1..$chr->{nb_chrs}){ + foreach my $frag (1..$chr->{$k}->{nb_frag}){ + + @ratio= ($chr->{$k}->{name}, + (${$chr->{$k}->{$frag}}[0]+1), + (${$chr->{$k}->{$frag}}[1]+1)); + + $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; + $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; + + push(@ratio,$cov,$cov_ref); + push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); + push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); + push(@ratio,log($cov+1)) if($cov && !$cov_ref); + next if(!$cov && !$cov_ref); + + print $fh join("\t",@ratio)."\n"; + } + } + + $fh->close; + print LOG "-- output created: $density_file\n"; + + undef %density; + undef %density_ref; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub FreqCalculation{ + + my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; + + my @sfile=split(/\./,$mates_file); + my $fchr=$sfile[$#sfile]; + my $fh = new FileHandle; + + my $nb_windows=0; + my $warn=100000; + my $record=0; + my %pair; + + my ($sumX,$sumX2) = (0,0); + + print LOG "\# Frequency calculation for $mates_file...\n"; + + if ($mates_file =~ /.gz$/) { + $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; + }elsif($mates_file =~ /.bam$/){ + o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; + } + + while(<$fh>){ + + my @t=split; + my $mate=$t[0]; + + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); + + next if(exists $pair{$mate}); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + $pair{$mate}=undef; + $record++; + + my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); + + recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); + recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); + + my $length = abs($coord_start_read1-$coord_start_read2); + $sumX += $length; #add to sum and sum^2 for mean and variance calculation + $sumX2 += $length*$length; + + for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ + + if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ + + &addToDensity($density,$chr_read1,$i,\$nb_windows) + if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); + + }else{ + + $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); + } + } + + if($record>=$warn){ + print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; + $warn+=100000; + } + } + $fh->close; + + print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; + + if($record>0){ + + my $mu = $sumX/$record; + my $sigma = sqrt($sumX2/$record - $mu*$mu); + print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; + } + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub ratio2segdup{ + + my($id,$density_file,$segdup_file)=@_; + + print LOG "# Converting to circos format...\n"; + + open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + while(<RATIO>){ + chomp; + my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; + print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; + } + + close RATIO; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub ratio2bedfile{ + + my($density_file,$bed_file)=@_; + + print LOG "# Converting to bedGraph format...\n"; + + open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". + "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; + + while(<RATIO>){ + chomp; + my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); + print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; + } + + close RATIO; + close BED; + print LOG "-- output created: $bed_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub inverseSense{ + + my $mate_sense=$_[0]; + my %reverse=( 'F' => 'R' , 'R' => 'F' , + 'FF' => 'RR', 'RR' => 'FF', + 'FR' => 'RF', 'RF' => 'FR'); + return $reverse{$mate_sense}; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getNextFrag{ + + my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; + + my $how_far = $read_start-$frag_start; + my $nb_windows_toskip; + + if($how_far>0){ + + $nb_windows_toskip=($how_far/$step)-($window_dist/$step); + $nb_windows_toskip=~ s/\..*//; + $nb_windows_toskip=0 if($nb_windows_toskip<0); + return ($frag_num + $nb_windows_toskip); + } + return $frag_last; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getColor{ + + my($count,$hcolor,$format)=@_; + for my $col ( keys % { $hcolor} ) { + return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); + } + return "white" if($format eq "circos"); + return "255,255,255" if($format eq "bed"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub recupCoords{ + + my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; + my $strand = 'F'; + + if ($c_hit=~s/^\-//) { + $strand='R'; + $$cs_hit=$c_hit; + $$ce_hit=$c_hit-($tag_length-1); + }else{ + $$cs_hit=$c_hit; + $$ce_hit=$c_hit+($tag_length-1); + } + return $strand; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub overlap { + my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; + if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { + return 0; + } + return 1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub makeLink { + + my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; + + if($chr1>$chr2){ + ($chr1,$chr2)= ($chr2,$chr1); + ($frag1,$frag2)= ($frag2,$frag1); + } + + if($chr1 == $chr2){ + if($frag1>$frag2){ + ($frag1,$frag2)= ($frag2,$frag1); + } + } + + if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ + $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; + $$nb++; + }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ + $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#fonction of adding the read to the density profile +sub addToDensity { + + my ($density,$chr1,$frag1,$nb)=@_; + + if(!exists $density->{$chr1}->{$frag1}->{count}){ + $density->{$chr1}->{$frag1}->{count}=1; + $$nb++; + }else{ + $density->{$chr1}->{$frag1}->{count}++; + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub floor { + my $nb = $_[0]; + $nb=~ s/\..*//; + return $nb; +} +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} + +#------------------------------------------------------------------------------# +sub max { + + my($max) = shift(@_); + foreach my $temp (@_) { + $max = $temp if $temp > $max; + } + return($max); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub min { + + my($min) = shift(@_); + foreach my $temp (@_) { + $min = $temp if $temp < $min; + } + return($min); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub sortTablebyIndex{ + my ($tab1,$tab2)=@_; + my @tab3; + + foreach my $i (@$tab1){ + $tab3[$i]=$$tab2[$$tab1[$i]]; + } + return @tab3; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub round { + my $number = shift || 0; + my $dec = 10 ** (shift || 0); + return int( $dec * $number + .5 * ($number <=> 0)) / $dec; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getUniqueTable{ + + my (@tab)=@_; + my (%saw,@out)=(); + undef %saw; + return sort(grep(!$saw{$_}++, @tab)); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub catFiles { + + unlink("$_[1]") if(exists $_[1]); + system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#check if the configuration file is correct +sub validateconfiguration{ + + my %conf=%{$_[0]}; + my $list_prgs="@ARGV"; + + my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); + my @detection_params=qw(split_mate_file window_size step_length split_mate_file); + my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); + my @circos_params=qw(organism_id colorcode); + my @bed_params=qw(colorcode); + my @compare_params=qw(list_samples file_suffix); + + foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ + + unless (defined($dir)) { + $dir = "."; + } + unless (-d $dir){ + mkdir $dir or die; + } + $dir.="/" if($dir!~/\/$/); + } + + unless (defined($conf{general}{num_threads})) { + $conf{general}{num_threads} = 1; + } + $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); + + if($list_prgs!~/links2compare/){ + + foreach my $p (@general_params){ + die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); + } + + $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); + + unless (defined($conf{general}{sv_type})) { + $conf{general}{sv_type} = "all"; + } + + $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; + } + + if($list_prgs=~/(linking|cnv)/){ + + foreach my $p (@detection_params){ + die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); + } + + die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); + + if($conf{detection}{step_length}>$conf{detection}{window_size}){ + die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); + } + + unless (-d $conf{general}{tmp_dir}."/mates"){ + mkdir $conf{general}{tmp_dir}."/mates" or die; + } + + if($list_prgs=~/linking/){ + unless (-d $conf{general}{tmp_dir}."/links"){ + mkdir $conf{general}{tmp_dir}."/links" or die; + } + } + if($list_prgs=~/cnv/){ + unless (-d $conf{general}{tmp_dir}."/density"){ + mkdir $conf{general}{tmp_dir}."/density" or die; + } + } + + } + + if($list_prgs=~/filtering/){ + + foreach my $p (@filtering_params) { + die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); + + } + + if(defined($conf{filtering}{chromosomes})) { + my @chrs=split(",",$conf{filtering}{chromosomes}); + my $exclude=($chrs[0]=~/^\-/)? 1:0; + for my $chrName (@chrs){ + + die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") + if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); + + } + } + + if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { + die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". + "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); + } + if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { + die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); + } + if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { + die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". + "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); + } + if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { + die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); + } + + if (!defined($conf{filtering}{indel_sigma_threshold})) { + $conf{filtering}{indel_sigma_threshold} = 2; + } + if (!defined($conf{filtering}{dup_sigma_threshold})) { + $conf{filtering}{dup_sigma_threshold} = 2; + } + if (!defined($conf{filtering}{singleton_sigma_threshold})) { + $conf{filtering}{singleton_sigma_threshold} = 4; + } + + if (!defined($conf{filtering}{nb_pairs_order_threshold})) { + $conf{filtering}{nb_pairs_order_threshold} = 1; + } + + if (!defined($conf{filtering}{final_score_threshold})) { + $conf{filtering}{final_score_threshold} = 0.8; + } + + if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { + die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); + } + + } + + if($list_prgs=~/2circos$/){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + + if($list_prgs=~/2bed$/){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + } + + if($list_prgs=~/links2compare/){ + foreach my $p (@compare_params) { + die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); + } + + unless (defined($conf{compare}{same_sv_type})) { + $conf{compare}{same_sv_type} = 0; + } + + unless (defined($conf{compare}{min_overlap})) { + $conf{compare}{min_overlap} = 1E-9; + } + + if($conf{compare}{circos_output}){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + if($conf{compare}{bed_output}){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); + + my @samples=split(",",$conf{compare}{list_samples}); + my @read_lengths=split(",",$conf{compare}{list_read_lengths}); + for my $i (0..$#samples){ + my @l=split("-",$read_lengths[$i]); + $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; + } + } + } + + +} +#------------------------------------------------------------------------------# +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/SVDetect_run_parallel.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,324 @@ +<tool id="svdetect_run_parallel" name="Detect clusters of anomalously mapped pairs"> + +<description>and identify structural variants</description> + +<command interpreter="perl">SVDetect_run_parallel.pl + +#if $getLinks.linking == "linking" +linking +<!-- -out1 '$links_file' --> +#end if +#if $getFilteredLinks.filtering == "filtering" +filtering +<!--- out2 '$flinks_file' --> +#if str($getFilteredLinks.links2SV) == "create" +links2SV +-out3 '$sv_file' +#end if +#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2circos) == "create" +links2circos +-out4 '$circos_file' +#end if +#if $getFilteredLinks.file_conversion.file_conversion_select=="convert" and str($getFilteredLinks.file_conversion.links2bed) == "create" +links2bed +-out5 '$bed_file' +#end if +#end if +-conf '$config_file' +-l '$log_file' +-N '$sample_name' + +</command> + +<inputs> + <param name="sample_name" type="text" value="sample" label="Sample Name"/> + <param name="mates_file" format="bam" type="data" label="Input BAM file (.ab.bam)"/> + <param name="cmap_file" format="len" type="data" label="Chromosomes list file (.len)" help="Tabulated file format with Chromosome ID (integer from 1), name and length"/> + <param name="mates_orientation" type="select" format="txt" label="Type of sequencing technology and libraries"> + <option value="FR">Illumina paired-ends</option> + <option value="RF">Illumina mate-pairs</option> + <option value="FR">SOLiD paired-ends</option> + <option value="RR">SOLiD mate-pairs</option> + </param> + <param name="read1_length" type="integer" size="10" value="50" label="Read 1 length (bp)" help="Length of the first read in a pair (left read)"/> + <param name="read2_length" type="integer" size="10" value="50" label="Read 2 length (bp)" help="Length of the second read in a pair (right read)"/> + <param name="sv_type" type="select" format="txt" label="Type of SV to detect"> + <option value="all">all types of SVs</option> + <option value="intra">intrachromosomal SVs only</option> + <option value="inter">interchromosomal SVs only</option> + </param> + + <conditional name="getLinks"> + <param name="linking" type="select" label="Linking procedure" help="Detection and isolation of links"> + <option value="linking">Yes</option> + <option value="">No, already done</option> + </param> + <when value=""> + <!-- do nothing here --> + </when> + <when value="linking"> + <param name="splitmate" label="Do you want to split the original mate file per chromosome for parallel computing?" type="boolean" truevalue="split" falsevalue="do_not_split" checked="True" help="Untick it if already done"/> + <param name="window_size" type="integer" size="20" value="3000" label="Window size (bp)" help="Equal to at least “2µ+2√2σ"/> + <param name="step_length" type="integer" size="20" value="250" label="Step length size (bp)" help="Equal to 1/2 or 1/4 of the window size"/> + </when> + </conditional> + + <conditional name="getFilteredLinks"> + <param name="filtering" type="select" label="Filtering procedure" help="Filtering of links according different parameters and thresholds"> + <option value="filtering">Yes</option> + <option value="">No</option> + </param> + <when value=""> + <!-- do nothing here --> + </when> + <when value="filtering"> + + <param name="splitlink" label="Do you want to split the original link file per chromosome for parallel computing?" type="boolean" truevalue="split" falsevalue="do_not_split" checked="False" help="Untick it if (the linking is) already done"/> + <param name="chromosomes" type="text" size="20" label="List of chromosome names to keep or exclude"/> + <param name="nb_pairs_threshold" type="integer" size="20" value="5" label="Minimum number of pairs in a cluster"/> + + <conditional name="filter1"> + <param name="strand_filtering" type="select" label="Strand filtering procedure"> + <option value="strand">Yes</option> + <option value="">No</option> + </param> + <when value=""> + <!-- do nothing here --> + </when> + <when value="strand"> + + <conditional name="filter2"> + <param name="order_filtering" type="select" label="Order filtering procedure"> + <option value="order">Yes</option> + <option value="">No</option> + </param> + <when value=""> + <!-- do nothing here --> + </when> + <when value="order"> + + <conditional name="filter3"> + <param name="insert_size_filtering" type="select" label="Insert-size filtering procedure"> + <option value="insert">Yes</option> + <option value="">No</option> + </param> + <when value=""> + <!-- do nothing here --> + </when> + <when value="insert"> + <param name="indel_sigma_threshold" type="float" size="20" value="3" label="Minimal number of sigma fold for the insert size filtering and to call insertions and deletions"/> + <param name="dup_sigma_threshold" type="float" size="20" value="3" label="minimal number of sigma fold for the insert size filtering to call tandem duplications"/> + <param name="singleton_sigma_threshold" type="float" size="20" value="4" label="Minimal number of sigma fold for the insert size filtering to call singletons" help="for Illumina mate-pairs only"/> + </when> + </conditional> + + <param name="mu_length" type="integer" size="20" value="3000" label="Mean insert size value (µ) of normally mapped mate-pairs, in bp"/> + <param name="sigma_length" type="integer" size="20" value="250" label="Calculated sd value (σ) from the distribution of normally mapped mate-pairs, in bp"/> + <param name="nb_pairs_order_threshold" type="integer" size="20" value="2" label="Minimal number of pairs in a subgroup of paired-end reads for balanced events"/> + </when> + </conditional> + + <param name="final_score_threshold" type="float" size="20" value="1.0" label="Minimal final filtering score for calling SVs" help="A value of 1 means all the pairs in a cluster were consistent between each other after applying filters"/> + </when> + </conditional> + + <param name="links2SV" label="Do you want to have filtered links in a tabulated file format showing significant SVs?" type="boolean" truevalue="create" falsevalue="do_not_create" checked="True"/> + + <conditional name="file_conversion"> + <param name="file_conversion_select" type="select" label="Output file conversion" help="Converts filtered links to Circos/BED files format for graphical view of SVs"> + <option value="do_not_convert">No</option> + <option value="convert">Yes</option> + </param> + <when value="do_not_convert"> + <!-- do nothing here --> + </when> + <when value="convert"> + <param name="links2circos" label="Converts the link list to the Circos link format" type="boolean" truevalue="create" falsevalue="do_not_create" checked="True"/> + <param name="links2bed" label="Converts the link list to the UCSC BED format" type="boolean" truevalue="create" falsevalue="do_not_create" checked="False"/> + <param name="organism_id" type="text" size="10" value="hs" label="Organism ID"/> + <repeat name="color_code" title="Color-code" min="1" max="7"> + <param name="color" type="select" label="Color"> + <option value="grey">grey</option> + <option value="black">black</option> + <option value="blue">blue</option> + <option value="green">green</option> + <option value="purple">purple</option> + <option value="orange">orange</option> + <option value="red">red</option> + </param> + <param name="interval" type="text" value="1,3" label="Interval"/> + </repeat> + </when> + </conditional> + </when> + </conditional> +</inputs> + + +<outputs> + <!--<data format="txt" name="links_file" label="svdetect.links"> + <filter>getLinks['linking']=="linking"</filter> + </data> + <data format="txt" name="flinks_file" label="svdetect.links.filtered"> + <filter>getFilteredLinks['filtering']=="filtering"</filter> + </data>--> + <data format="sv" name="sv_file" label="${sample_name}.sv"> + <filter>( + getFilteredLinks['filtering']=="filtering" and + getFilteredLinks['links2SV'] is True + ) + </filter> + </data> + <data format="segdup" name="circos_file" label="${sample_name}.segdup"> + <filter>( + getFilteredLinks['filtering']=="filtering" and + getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and + getFilteredLinks['file_conversion']['links2circos'] is True + ) + </filter> + </data> + <data format="bed" name="bed_file" label="${sample_name}.bed"> + <filter>( + getFilteredLinks['filtering']=="filtering" and + getFilteredLinks['file_conversion']['file_conversion_select']=="convert" and + getFilteredLinks['file_conversion']['links2bed'] is True + ) + </filter> + </data> + <data format="txt" name="log_file" label="${sample_name}.svdetect_run.log"/> +</outputs> + + + +<configfiles> + <configfile name="config_file"> +<general> +input_format = bam +sv_type = ${sv_type} +mates_orientation=${mates_orientation} +read1_length=${read1_length} +read2_length=${read2_length} +mates_file=${mates_file} +cmap_file=${cmap_file} +tmp_dir=$__new_file_path__/svdetect/tmp +output_dir=$__new_file_path__/svdetect +num_threads=8 +</general> + +#if $getLinks.linking == "linking" +<detection> +#if str($getLinks.splitmate) == "split" +split_mate_file=1 +#else +split_mate_file=0 +#end if +window_size=${getLinks.window_size} +step_length=${getLinks.step_length} +</detection> +#end if + +#if $getFilteredLinks.filtering == "filtering" +<filtering> +#if str($getFilteredLinks.splitlink) == "split" +split_link_file=1 +#else +split_link_file=0 +#end if +#if str($getFilteredLinks.chromosomes) != "" +chromosomes=${getFilteredLinks.chromosomes} +#end if +nb_pairs_threshold=${getFilteredLinks.nb_pairs_threshold} +#if $getFilteredLinks.filter1.strand_filtering == "strand" +strand_filtering=1 +final_score_threshold=${getFilteredLinks.filter1.final_score_threshold} +#if $getFilteredLinks.filter1.filter2.order_filtering == "order" +order_filtering=1 +mu_length=${getFilteredLinks.filter1.filter2.mu_length} +sigma_length=${getFilteredLinks.filter1.filter2.sigma_length} +nb_pairs_order_threshold=${getFilteredLinks.filter1.filter2.nb_pairs_order_threshold} +#if $getFilteredLinks.filter1.filter2.filter3.insert_size_filtering == "insert" +insert_size_filtering=1 +indel_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.indel_sigma_threshold} +dup_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.dup_sigma_threshold} +singleton_sigma_threshold=${getFilteredLinks.filter1.filter2.filter3.singleton_sigma_threshold} +#else +insert_size_filtering=0 +#end if +#else +order_filtering=0 +#end if +#else +strand_filtering=0 +#end if +</filtering> +#end if + +#if $getFilteredLinks.filtering == "filtering" +#if $getFilteredLinks.file_conversion.file_conversion_select == "convert" +#if str($getFilteredLinks.file_conversion.links2circos) == "create" +<circos> +organism_id=${getFilteredLinks.file_conversion.organism_id} +<colorcode> +#for $color_repeat in $getFilteredLinks.file_conversion.color_code +${color_repeat.color}=${color_repeat.interval} +#end for +</colorcode> +</circos> +#end if +#if str($getFilteredLinks.file_conversion.links2bed) == "create" +<bed> +<colorcode> +#for $color_repeat in $getFilteredLinks.file_conversion.color_code +#if str($color_repeat.color)== "grey" +190,190,190=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "black" +0,0,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "blue" +0,0,255=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "green" +0,255,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "purple" +153,50,205=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "orange" +255,140,0=${color_repeat.interval} +#end if +#if str($color_repeat.color)== "red" +255,0,0=${color_repeat.interval} +#end if +#end for +</colorcode> +</bed> +#end if +#end if +#end if + </configfile> +</configfiles> + + <help> +**What it does** + +SVDetect - Version : 0.8b + +Parallel version (nCPU=8) + +SVDetect is a application for the isolation and the type prediction of intra- and inter-chromosomal rearrangements from paired-end/mate-pair sequencing data provided by the high-throughput sequencing technologies + +This tool aims to identifying structural variations (SVs) with both clustering and sliding-window strategies, and helping in their visualization at the genome scale. +SVDetect is compatible with SOLiD and Illumina (>=1.3) reads. + +Manual documentation available at the http://svdetect.sourceforge.net/Site/Manual.html + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of SVDetect. + + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/svdetect/circos_graph.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,290 @@ +<tool id="circos_graph" name="Circos" version="1.1.0"> + +<description>plots</description> + +<command interpreter="perl">circos/bin/circos + +-conf '$circos_config_file' +-outputfile '${outputfile}.dat' +-png + +> '$log_file' + +; + +rm '$outputfile'; ln -s '${outputfile}.png' '$outputfile' + +</command> + + +<inputs> + <param name="graph_name" type="text" size="20" value="graph1" label="Graph name"/> + + <param name="karyotype" type="select" format="txt" label="Type of model organism"> + <option value="data/karyotype.human_hg19.txt">Human (homo sapiens, hs) -hg19-</option> + <option value="data/karyotype.human.txt">Human (homo sapiens, hs) -hg18-</option> + <option value="data/2/karyotype.mouse.txt">Mouse (Mus Musculus, mm)</option> + <option value="data/2/karyotype.dog.txt">Dog (Canis familiaris, cf)</option> + <option value="data/2/karyotype.rt.txt">Rat (Rattus norvegicus, rn)</option> + <option value="data/karyotype.yeast.txt">Yeast (Saccharomyces Cerevisiae, sc) -SGD-</option> + + </param> + <param name="chromosomes_units" type="integer" size="50" value="1000000" label="Chromosomes units"/> + <param name="chromosomes" type="text" size="100" value="" label="List of chromosome names to keep or exclude" help="ex: hs2;hs3 or -hsX;-hsY"> + <sanitizer> + <valid initial="string.printable"> + <add value=";"/> + </valid> + </sanitizer> + </param> + <param name="link_file" format="segdup" type="data" label="Input link file (.segdup)"/> +</inputs> + +<outputs> + <data format="txt" name="log_file" label="${graph_name}.circos.log"/> + <data format="png" name="outputfile" label="${graph_name}.png"/> +</outputs> + + + +<configfiles> + <configfile name="ideogram_config_file"> + +<ideogram> + +<spacing> + +default = 5u +break = 1u + +axis_break_at_edge = yes +axis_break = yes +axis_break_style = 2 + +<break_style 1> +stroke_color = black +fill_color = blue +thickness = 0.25r +stroke_thickness = 2 +</break> + +<break_style 2> +stroke_color = black +stroke_thickness = 3p +thickness = 1.5r +</break> + +</spacing> + +## thickness (px) of chromosome ideogram +thickness = 100p +stroke_thickness = 2 +## ideogram border color +stroke_color = black +fill = yes +## the default chromosome color is set here and any value +## defined in the karyotype file overrides it +fill_color = black + +## fractional radius position of chromosome ideogram within image +radius = 0.85r +show_label = yes +label_with_tag = yes +label_font = condensedbold +label_radius = dims(ideogram,radius) + 0.075r +label_size = 60p + +## cytogenetic bands +band_stroke_thickness = 2 + +## show_bands determines whether the outline of cytogenetic bands +## will be seen +show_bands = yes +## in order to fill the bands with the color defined in the karyotype +## file you must set fill_bands +fill_bands = yes + +</ideogram> + + </configfile> + + <configfile name="ticks_config_file"> + +show_ticks = yes +show_tick_labels = yes + +<ticks> +radius = dims(ideogram,radius_outer) +multiplier = 1e-6 + +<tick> +spacing = 0.5u +size = 2p +thickness = 2p +color = grey +show_label = no +label_size = 12p +label_offset = 0p +format = %.2f +</tick> + +<tick> +spacing = 1u +size = 3p +thickness = 2p +color = dgrey +show_label = no +label_size = 12p +label_offset = 0p +format = %.2f +</tick> + +<tick> +spacing = 5u +size = 5p +thickness = 2p +color = black +show_label = yes +label_size = 16p +label_offset = 0p +format = %d +</tick> + +<tick> +spacing = 10u +size = 8p +thickness = 2p +color = black +show_label = yes +label_size = 20p +label_offset = 5p +format = %d +</tick> +</ticks> + </configfile> + + + <configfile name="circos_config_file"> +<colors> +<<include etc/colors.conf>> +</colors> + +<fonts> +<<include etc/fonts.conf>> +</fonts> + +<<include $ideogram_config_file>> +<<include $ticks_config_file>> + +karyotype = $karyotype + +<image> +24bit = yes +##png = yes +##svg = no +## radius of inscribed circle in image +radius = 1500p +background = white +## by default angle=0 is at 3 o'clock position +angle_offset = -90 +#angle_orientation = counterclockwise + +auto_alpha_colors = yes +auto_alpha_steps = 5 +</image> + +chromosomes_units= $chromosomes_units + +#if str($chromosomes)=="" +chromosomes_display_default = yes +#else +chromosomes_display_default = no +chromosomes = $chromosomes +#end if + +<links> + +z = 0 +radius = 0.95r +bezier_radius = 0.2r + +<link segdup> +show = yes +color = dgrey_a5 +thickness = 2 +file = $link_file +record_limit = 1000 +</link> + +</links> + + +anglestep = 0.5 +minslicestep = 10 +beziersamples = 40 +debug = no +warnings = no +imagemap = no + +units_ok = bupr +units_nounit = n + </configfile> +</configfiles> + + <help> +**What it does** + +Circos + +Manual documentation available at the http://circos.ca/ + + +**Example of link segdup file** + +segdup file:: + + 1 hs1 1077096 1078746 color=red + 1 hs1 1080923 1082805 color=red + 2 hs1 1137684 1137961 color=red + 2 hs3 1138138 1138423 color=red + 3 hs11 1169417 1170000 color=red + 3 hs11 1170025 1170975 color=red + 4 hs11 1222480 1224271 color=green + 4 hs11 1223328 1225675 color=green + 5 hs12 1223336 1225812 color=grey + 5 hs13 1224709 1227633 color=grey + 6 hs11 1223621 1226460 color=red + 6 hs11 1224918 1227633 color=red + 7 hs11 1399510 1401513 color=white + 7 hs11 1401628 1403697 color=white + 8 hs15 1652045 1653746 color=red + 8 hs15 1657167 1658940 color=red + 9 hs11 165333 165887 color=white + 9 hs11 165981 168016 color=white + 10 hs11 1702700 1702841 color=red + 10 hs11 1702903 1703057 color=red + 11 hs11 1912272 1915186 color=white + 11 hs11 1937111 1939824 color=white + 12 hs11 1983211 1983355 color=red + 12 hs11 1983591 1983748 color=red + 13 hs11 2913657 2913898 color=white + 13 hs11 2914048 2914341 color=white + 14 hs11 3090593 3090749 color=purple + 14 hs11 3090709 3090864 color=purple + 15 hs21 3466365 3466434 color=red + 15 hs21 3466554 3466620 color=red + 16 hsX 3603073 3603321 color=white + 16 hsX 3603295 3603520 color=white + + + +----- + +.. class:: infomark + +Contact Bruno Zeitouni (svdetect@curie.fr) for any questions or concerns about the Galaxy implementation of Circos. + + + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/varscan/tool_dependencies.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="VarScan" version="2.3.5"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://downloads.sourceforge.net/project/varscan/VarScan.v2.3.5.jar</action> + <action type="move_file"> + <source>VarScan.v2.3.5.jar</source> + <destination>$INSTALL_DIR/jars</destination> + </action> + <action type="set_environment"> + <environment_variable name="JAVA_JAR_PATH" action="set_to">$INSTALL_DIR/jars</environment_variable> + </action> + </actions> + </install> + <readme> + </readme> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/varscan/varscan_mpileup.pl Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,116 @@ +#!/usr/bin/perl + +use strict; +use Cwd; + +die qq( +Bad numbr of inputs + +) if(!@ARGV); + +my $options =""; +my $file=""; +my $command=""; +my $output=""; +my $working_dir = cwd(); +my $temp_vcf = "$working_dir/temp"; +my $log=""; + +foreach my $input (@ARGV) +{ + my @tmp = split "::", $input; + if($tmp[0] eq "COMMAND") + { + $command = $tmp[1]; + } + elsif($tmp[0] eq "INPUT") + { + $file = $tmp[1]; + } + elsif($tmp[0] eq "OPTION") + { + $options = "$options ${tmp[1]}"; + } + elsif($tmp[0] eq "OUTPUT") + { + $output = $tmp[1]; + } + elsif($tmp[0] eq "LOG") + { + $log = $tmp[1]; + } + else + { + die("Unknown Input: $input\n"); + } +} + +system ("$command $file $options 1>$temp_vcf 2>$log"); + +vs2vcf($temp_vcf, $output); + + +sub vs2vcf +{ + + # + # G l o b a l v a r i a b l e s + # + my $version = '0.1'; + + # + # Read in file + # + my $input = shift; + my $output = shift; + my $chr_ord = shift; + open(IN, $input) or die "Can't open $input': $!\n"; + open(OUT, ">$output") or die "Can't create $output': $!\n"; + my %output; + + while ( <IN> ) + { + if ( /^#/ ) + { + print OUT; + next; + } + chomp; + my $line = $_; + + my @flds = split ( "\t", $line ); + my $ref = $flds[3]; + my $alt = $flds[4]; + # + # Deletion of bases + # + if ( $alt =~ /^\-/ ) + { + ($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref); + } + + # + # Insertion of bases + # + if ( $alt =~ /^\+/ ) + { + $flds[4] = $ref.substr($alt,1); + } + print OUT join( "\t", @flds),"\n" unless defined $chr_ord; + $output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord; + } + close(IN); + # if chromosome order given return in sorted order + if(defined $chr_ord) + { + for my $chrom (@{ $chr_ord }) + { + for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} }) + { + print OUT $output{$chrom}{$pos}; + } + } + } + close(OUT); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/varscan/varscan_mpileup.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,124 @@ +<tool id="varscan_mpileup" name="VarScan mpileup" version="2.3.5"> + <description> + mutation caller for targeted, exome, and whole-genome resequencing + </description> + <requirements> + <requirement type="package" version="2.3.5">VarScan</requirement> + </requirements> + <command interpreter="perl"> + + varscan_mpileup.pl + "COMMAND::java -jar \$JAVA_JAR_PATH/VarScan.v2.3.5.jar $exe_command" + "INPUT::$in_file" + "OUTPUT::$output" + "LOG::$log" + "OPTION::--min-coverage $min_coverage" + "OPTION::--min-reads2 $min_reads2" + "OPTION::--min-avg-qual $min_avg_qual" + "OPTION::--min-var-freq $min_var_freq" + "OPTION::--min-freq-for-hom $min_freq_for_hom" + "OPTION::--p-value $p_value" + "OPTION::--strand-filter $strand_filter" + "OPTION::--output-vcf 1" + + #if ($vcf_sample_list): + "OPTION::--vcf-sample-list $vcf_sample_list" + #end if + "OPTION::--variants $variants" + + + + </command> + + <inputs> + + <param name="exe_command" type="select" label="Command" help="" optional="false"> + <option value="mpileup2snp" >mpileup2snp</option> + <option value="mpileup2indel">mpileup2indel</option> + <option value="mpileup2cns">mpileup2cns</option> + </param> + <param name="in_file" type="data" format="pileup" label="mpileup file" help="The SAMtools mpileup file" /> + <param name="min_coverage" type="integer" label="min-coverage" help="" optional="true" value="8"/> + <param name="min_reads2" type="integer" label="min-reads2" help="" optional="true" value="2"/> + <param name="min_avg_qual" type="integer" label="min-avg-qual" help="" optional="true" value="15"/> + <param name="min_var_freq" type="float" label="min-var-freq" help="" optional="true" value="0.01"/> + <param name="min_freq_for_hom" type="float" label="min-freq-for-hom" help="" optional="true" value="0.75"/> + <param name="p_value" type="text" label="p-value" help="" optional="true" value="0.99"/> + <param name="strand_filter" type="integer" label="strand-filter" help="" optional="true" value="1"/> + <param name="vcf_sample_list" type="data" label="vcf-sample-list" format="txt" help="" optional="true" /> + <param name="variants" type="integer" label="variants" help="Set to 1 to report only variants" optional="true" value="1"/> + + + </inputs> + <outputs> + <data type="data" format="vcf" name="output" label="${tool.name} result on ${on_string}"/> + <data type="data" format="txt" name="log" label="${tool.name} result on ${on_string} (log) "/> + </outputs> + + <help> + +.. class:: infomark + +**What it does** + +:: + + VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. It can be used to detect different types of variation: + + Germline variants (SNPs an dindels) in individual samples or pools of samples. + Multi-sample variants (shared or private) in multi-sample datasets (with mpileup). + Somatic mutations, LOH events, and germline variants in tumor-normal pairs. + Somatic copy number alterations (CNAs) in tumor-normal exome data. + + +**Input** + +:: + + mpileup file - The SAMtools mpileup file + + +**Parameters** + +:: + + commands + mpileup2snp Identify SNPs from an mpileup file + mpileup2indel Identify indels an mpileup file + mpileup2cns Call consensus and variants from an mpileup file + + min-coverage + Minimum read depth at a position to make a call [8] + + min-reads2 + Minimum supporting reads at a position to call variants [2] + + min-avg-qual + Minimum base quality at a position to count a read [15] + + min-var-freq + Minimum variant allele frequency threshold [0.01] + + min-freq-for-hom + Minimum frequency to call homozygote [0.75] + + p-value + Default p-value threshold for calling variants [99e-02] + + strand-filter + Ignore variants with >90% support on one strand [1] + + output-vcf + If set to 1, outputs in VCF format + + vcf-sample-list + For VCF output, a list of sample names in order, one per line + + variants + Report only variant (SNP/indel) positions [0] + + + + </help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/varscan/varscan_somatic.pl Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,138 @@ +#!/usr/bin/perl + + +use strict; +use Cwd; + +die qq( +Bad numbr of inputs + +) if(!@ARGV); + +my $options =""; +my $normal=""; +my $command=""; +my $tumor=""; +my $output=""; +my $working_dir = cwd(); +my $snp = "$working_dir/output.snp.vcf"; +my $indels = "$working_dir/output.indel.vcf"; + +foreach my $input (@ARGV) +{ + my @tmp = split "::", $input; + if($tmp[0] eq "COMMAND") + { + $command = $tmp[1]; + } + if($tmp[0] eq "NORMAL") + { + $normal = $tmp[1]; + } + elsif($tmp[0] eq "TUMOR") + { + $tumor = $tmp[1]; + } + elsif($tmp[0] eq "OPTION") + { + $options = "$options ${tmp[1]}"; + } + elsif($tmp[0] eq "OUTPUT") + { + $output = $tmp[1]; + } + + else + { + die("Unknown Input: $input\n"); + } +} + +system ("$command $normal $tumor $options "); +system("grep -v '^\#' $indels | grep -v '^chrom position' >> $snp"); + +my @chr_ord = chromosome_order($tumor); + +vs2vcf($snp, $output,\@chr_ord); + + +sub vs2vcf +{ + + # + # G l o b a l v a r i a b l e s + # + my $version = '0.1'; + + # + # Read in file + # + my $input = shift; + my $output = shift; + my $chr_ord = shift; + open(IN, $input) or die "Can't open $input': $!\n"; + open(OUT, ">$output") or die "Can't create $output': $!\n"; + my %output; + + while ( <IN> ) + { + if ( /^#/ ) + { + print OUT; + next; + } + chomp; + my $line = $_; + + my @flds = split ( "\t", $line ); + my $ref = $flds[3]; + my $alt = $flds[4]; + # + # Deletion of bases + # + if ( $alt =~ /^\-/ ) + { + ($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref); + } + + # + # Insertion of bases + # + if ( $alt =~ /^\+/ ) + { + $flds[4] = $ref.substr($alt,1); + } + print OUT join( "\t", @flds),"\n" unless defined $chr_ord; + $output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord; + } + close(IN); + # if chromosome order given return in sorted order + if(defined $chr_ord) + { + for my $chrom (@{ $chr_ord }) + { + for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} }) + { + print OUT $output{$chrom}{$pos}; + } + } + } + close(OUT); +} + + +sub chromosome_order +{ + my $input = shift; + # calculate flagstats + my $COMM = "samtools view -H $input | grep '^\@SQ'"; + my @SQ = `$COMM`; + chomp @SQ; + for(my $i = 0; $i <= $#SQ; $i++) + { + $SQ[$i] =~ s/^\@SQ\tSN:(.*?)\tLN:\d+$/$1/; + } + return(@SQ); +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngsap-vc/varscan/varscan_somatic.xml Thu Oct 03 10:42:15 2019 -0400 @@ -0,0 +1,131 @@ +<tool id="varscan_somatic" name="VarScan Somatic" version="2.3.5"> + <description> + somatic mutation caller for cancer genomics + </description> + <requirements> + <requirement type="package" version="2.3.5">VarScan</requirement> + </requirements> + <command interpreter="perl"> + varscan_somatic.pl + "COMMAND::java -jar \$JAVA_JAR_PATH/VarScan.v2.3.5.jar somatic" + "NORMAL::$normal" + "TUMOR::$tumor" + "OUTPUT::$output" + + "OPTION::--min-coverage $min_coverage" + "OPTION::--min-coverage-normal $min_coverage_normal" + "OPTION::--min-coverage-tumor $min_coverage_tumor" + + "OPTION::--min-var-freq $min_var_freq" + "OPTION::--min-freq-for-hom $min_freq_for_hom" + + "OPTION::--normal-purity $normal_purity" + "OPTION::--tumor-purity $tumor_purity" + + "OPTION::--p-value $p_value" + "OPTION::--somatic-p-value $somatic_p_value" + + "OPTION::--strand-filter $strand_filter" + "OPTION::--validation $validation" + "OPTION::--output-vcf 1" + + + + </command> + + <inputs> + + <param name="normal" type="data" format="pileup" label="normal mpileup file" help="The SAMtools mpileup file for normal sample" /> + <param name="tumor" type="data" format="pileup" label="tumor mpileup file" help="The SAMtools mpileup file for tumor sample" /> + + <param name="min_coverage" type="integer" label="min-coverage" help="" optional="true" value="8"/> + <param name="min_coverage_normal" type="integer" label="min-coverage-normal" help="" optional="true" value="8"/> + <param name="min_coverage_tumor" type="integer" label="min-coverage-tumor" help="" optional="true" value="6"/> + + <param name="min_var_freq" type="float" label="min-var-freq" help="" optional="true" value="0.10"/> + <param name="min_freq_for_hom" type="float" label="min-freq-for-hom" help="" optional="true" value="0.75"/> + + <param name="normal_purity" type="float" label="normal-purity" help="" optional="true" value="1.00"/> + <param name="tumor_purity" type="float" label="tumor-purity" help="" optional="true" value="1.00"/> + + + <param name="p_value" type="text" label="p-value" help="" optional="true" value="0.99"/> + <param name="somatic_p_value" type="text" label="somatic-p-value" help="" optional="true" value="0.05"/> + + <param name="strand_filter" type="integer" label="strand-filter" help="" optional="true" value="1"/> + <param name="validation" type="integer" label="validation" help="" optional="true" value="0"/> + + </inputs> + <outputs> + <data type="data" format="vcf" name="output" label="${tool.name} result on ${on_string}"/> + </outputs> + + <help> + +.. class:: infomark + +**What it does** + +:: + + VarScan is a platform-independent mutation caller for targeted, exome, and whole-genome resequencing data generated on Illumina, SOLiD, Life/PGM, Roche/454, and similar instruments. The newest version, VarScan 2, is written in Java, so it runs on most operating systems. It can be used to detect different types of variation: + + Germline variants (SNPs an dindels) in individual samples or pools of samples. + Multi-sample variants (shared or private) in multi-sample datasets (with mpileup). + Somatic mutations, LOH events, and germline variants in tumor-normal pairs. + Somatic copy number alterations (CNAs) in tumor-normal exome data. + + +**Input** + +:: + + mpileup normal file - The SAMtools mpileup file for normal + mpileup tumor file - The SAMtools mpileup file for tumor + + +**Parameters** + +:: + + min-coverage + Minimum read depth at a position to make a call [8] + + min-coverage-normal + Minimum coverage in normal to call somatic [8] + + min-coverage-tumor + Minimum coverage in tumor to call somatic [6] + + min-var-freq + Minimum variant frequency to call a heterozygote [0.10] + + min-freq-for-hom + Minimum frequency to call homozygote [0.75] + + normal-purity + Estimated purity (non-tumor content) of normal sample [1.00] + + tumor-purity + Estimated purity (tumor content) of tumor sample [1.00] + + p-value + Default p-value threshold for calling variants [0.99] + + somatic-p-value + P-value threshold to call a somatic site [0.05] + + strand-filter + If set to 1, removes variants with >90% strand bias + + validation + If set to 1, outputs all compared positions even if non-variant + + output-vcf + If set to 1, outputs in VCF format [Default] + + + + </help> +</tool> +
