# HG changeset patch
# User Franco Caramia
# Date 1400543940 -36000
# Node ID 2770f49cb0dc99b58d5b051b782552c50c9dbb90
# Parent e8a98923965e12aedde75aee3d69f8db89cfc208
re-uploading contra
diff -r e8a98923965e -r 2770f49cb0dc CONTRA_User_Guide.2.0.pdf
Binary file CONTRA_User_Guide.2.0.pdf has changed
diff -r e8a98923965e -r 2770f49cb0dc all_fasta.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/all_fasta.loc.sample Tue May 20 09:59:00 2014 +1000
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
+#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
+#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
diff -r e8a98923965e -r 2770f49cb0dc baseline.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline.xml Tue May 20 09:59:00 2014 +1000
@@ -0,0 +1,71 @@
+
+ : Control files for Contra
+
+ bedtools
+ samtools
+ contra
+
+
+
+ baseline_wrapper.pl
+
+ ##Required files
+ "PLAYEROPTION::-t=$target_file"
+
+ #for $group in $file_group
+ "BAMLISTENTRY::${group.bam}"
+ #end for
+
+ "PLAYEROPTION::--name=$sampleName"
+ "PLAYEROPTION::--trim=$trim"
+
+ ##File to generate the bam list
+ "BASELINEOUTPUT::$baseline_output"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+|
+
+**Reference**
+ http://contra-cnv.sourceforge.net/
+
+-----
+
+**What it does**
+
+Creating a baseline control from multiple samples can be useful when a matched control is not available. On the CONTRA download page, we have provided several baseline files for some of the platforms that we have tried. Alternatively, the “baseline.py” script that comes with CONTRA can be used to generate a custom baseline file.
+
+-----
+
+**Parameters**
+
+::
+
+ -t, --target Target region definition file [REQUIRED] [BED format]
+
+ -f, --files Files to be converted to baselines [REQUIRED] [BAM]
+
+ -o, --output Output folder [REQUIRED]
+
+ -c, --trim Portion of outliers to be removed before calculating
+ average [Default: 0.2]
+
+ -n, --name Output baseline file name [Default: baseline]
+
+
+
+
+
+
diff -r e8a98923965e -r 2770f49cb0dc baseline_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline_wrapper.pl Tue May 20 09:59:00 2014 +1000
@@ -0,0 +1,68 @@
+# Galaxy wrapper around CONTRA's baseline.py.
+#
+# Every argument is a "KEY::value" token:
+#   PLAYEROPTION::<opt>[=<value>]  option forwarded to baseline.py
+#   BAMLISTENTRY::<path>           one input BAM file (repeatable)
+#   BASELINEOUTPUT::<path>         where the generated baseline is moved to
+use strict;
+use warnings;
+use File::Basename;
+use File::Copy qw(move);
+use Cwd;
+
+die qq(
+Bad number of inputs
+
+) if(!@ARGV);
+
+my @player_options;
+my @files;
+my $baseline_output;
+
+foreach my $input (@ARGV) {
+    # Limit of 2 keeps a "::" that occurs inside the value itself.
+    my ($key, $value) = split /::/, $input, 2;
+    die("Unknown Input: $input\n") if !defined $value;
+
+    if ($key eq "PLAYEROPTION") {
+        # Only the first "=" separates option from value.
+        push @player_options, split /=/, $value, 2;
+    }
+    elsif ($key eq "BASELINEOUTPUT") {
+        $baseline_output = $value;
+    }
+    elsif ($key eq "BAMLISTENTRY") {
+        push @files, $value;
+    }
+    else {
+        die("Unknown Input: $input\n");
+    }
+}
+die("Missing BASELINEOUTPUT argument\n") if !defined $baseline_output;
+die("No BAMLISTENTRY argument given\n") if !@files;
+
+my $working_dir = "BASELINE_OUTPUT";
+if (!-d $working_dir) {
+    mkdir $working_dir or die("Cannot create $working_dir: $!\n");
+}
+
+# Run baseline.py in list form (no shell), so paths containing spaces or
+# shell metacharacters reach it verbatim. Its stdout is discarded.
+my @cmd = ("baseline.py", "--file", @files,
+           "--output", $working_dir, @player_options);
+open(my $saved_stdout, ">&", \*STDOUT) or die("Cannot save STDOUT: $!\n");
+open(STDOUT, ">", "/dev/null") or die("Cannot silence STDOUT: $!\n");
+my $status = system(@cmd);
+open(STDOUT, ">&", $saved_stdout) or die("Cannot restore STDOUT: $!\n");
+die("baseline.py failed (wait status $status)\n") if $status != 0;
+
+# Hand every .txt file found in the output dir to Galaxy; as before, if
+# several exist the last one moved wins.
+opendir(my $dh, $working_dir) or die("Cannot open $working_dir: $!\n");
+my @txt_files = grep { (fileparse($_, qr/\.[^.]*/))[2] eq ".txt" } readdir($dh);
+closedir($dh);
+foreach my $file (@txt_files) {
+    move("$working_dir/$file", $baseline_output)
+        or die("Cannot move $file to $baseline_output: $!\n");
+}
+
diff -r e8a98923965e -r 2770f49cb0dc contra.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/contra.xml Tue May 20 09:59:00 2014 +1000
@@ -0,0 +1,288 @@
+
+ : Copy Number Analysis for Targeted Resequencing
+
+ bedtools
+ samtools
+ contra
+
+
+
+ contra_wrapper.pl
+
+ ##Ref Genome
+ #if $genomeSource.refGenomeSource == "history":
+ "PLAYEROPTION::-f=${genomeSource.ownFile}"
+ #else:
+ ##use precomputed indexes
+ "PLAYEROPTION::-f=${genomeSource.indices.fields.path}"
+ #end if
+
+ ##Required files
+ "PLAYEROPTION::-t=$target_file"
+ "PLAYEROPTION::-s=$alignment_file"
+ #if $controlSource.refControlSource == "history":
+ "PLAYEROPTION::-c=${controlSource.control_file}"
+ #else:
+ ##use precomputed indexes
+ "PLAYEROPTION::-c=${controlSource.indices.fields.path}"
+ #end if
+
+ ##Optional parameter
+
+ #if $option.option == "modify_parameters":
+
+ "PLAYEROPTION::--numBin=$option.numBin"
+ "PLAYEROPTION::--minReadDepth=$option.minReadDepth"
+ "PLAYEROPTION::--minNBases=$option.minNbases"
+
+ #if str($option.sam) == "true":
+ "PLAYEROPTION::--sam"
+ #end if
+
+ #if str($option.bed) == "true":
+ "PLAYEROPTION::--bed"
+ #end if
+
+ "PLAYEROPTION::--pval=$option.pval"
+ "PLAYEROPTION::--sampleName=$option.sampleName"
+
+ #if str($option.nomultimapped) == "true":
+ "PLAYEROPTION::--nomultimapped"
+ #end if
+
+ #if str($option.plot) == "true":
+ "PLAYEROPTION::--plot"
+ #end if
+
+ "PLAYEROPTION::--minExon=$option.minExon"
+ "PLAYEROPTION::--minControlRdForCall=$option.minControlRdForCall"
+ "PLAYEROPTION::--minTestRdForCall=$option.minTestRdForCall"
+ "PLAYEROPTION::--minAvgForCall=$option.minAvgForCall"
+ "PLAYEROPTION::--maxRegionSize=$option.maxRegionSize"
+ "PLAYEROPTION::--targetRegionSize=$option.targetRegionSize"
+
+ #if str($option.largedeletion) == "true":
+ "PLAYEROPTION::--largedeletion"
+ #end if
+
+ "PLAYEROPTION::--smallSegment=$option.smallSegment"
+ "PLAYEROPTION::--targetRegionSize=$option.targetRegionSize"
+ "PLAYEROPTION::--largeSegment=$option.largeSegment"
+ "PLAYEROPTION::--lrCallStart=$option.lrCallStart"
+ "PLAYEROPTION::--lrCallEnd=$option.lrCallEnd"
+ "PLAYEROPTION::--passSize=$option.passSize"
+ #end if
+
+ ##File to generate the bam list
+ CONTRAOUTPUT::$html_file
+ CONTRADIR::$html_file.files_path
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+|
+
+
+**Reference**
+ http://contra-cnv.sourceforge.net/
+
+-----
+
+**What it does**
+
+CONTRA is a tool for copy number variation (CNV) detection for targeted resequencing data such as those from whole-exome capture data. CONTRA calls copy number gains and losses for each target region; its key strategies include the use of base-level log-ratios to remove GC-content bias, correction for an imbalanced library size effect on log-ratios, and the estimation of log-ratio variations via binning and interpolation. It takes standard alignment formats (BAM/SAM) as input and writes output in variant call format (VCF 4.0) for easy integration with other next-generation sequencing analysis packages.
+
+
+-----
+
+**Required Parameters**
+
+::
+
+ -t, --target Target region definition file [BED format]
+
+ -s, --test Alignment file for the test sample [BAM/SAM]
+
+ -c, --control Alignment file for the control sample
+ [BAM/SAM/BED – baseline file]
+
+ --bed **option has to be supplied for control
+ with baseline file.**
+
+ -f, --fasta Reference genome [FASTA]
+
+ -o, --outFolder the folder name (and its path) to store the output
+ of the analysis (this new folder will be created –
+ error message occur if the folder exists)
+
+-----
+
+**Optional Parameters**
+
+::
+
+ --numBin Numbers of bins to group the regions. User can
+ specify multiple experiments with different numbers
+ of bins (comma separated). [Default: 20]
+
+ --minReadDepth The threshold for minimum read depth for each bases
+ (see Step 2 in CONTRA workflow) [Default: 10]
+
+ --minNBases The threshold for minimum number of bases for each
+ target regions (see Step 2 in CONTRA workflow)
+ [Default: 10]
+
+ --sam If the specified test and control samples are in
+ SAM format. [Default: False] (It will always take
+ BAM samples as default)
+
+ --bed If specified, control will be a baseline file in
+ BED format. [Default: False]
+ Please refer to the Baseline Script section for
+ instruction how to create baseline files from set
+ of BAMfiles. A set of baseline files from different
+ platform have also been provided in the CONTRA
+ download page.
+
+ --pval The p-value threshold for filtering. Based on Adjusted
+ P-Values. Only regions that pass this threshold will
+ be included in the VCF file. [Default: 0.05]
+
+ --sampleName The name to be appended to the front of the default output
+ name. By default, there will be nothing appended.
+
+ --nomultimapped The option to remove multi-mapped reads
+ (using SAMtools with mapping quality > 0).
+ [default: FALSE]
+
+ -p, --plot If specified, plots of log-ratio distribution for each
+ bin will be included in the output folder [default: FALSE]
+
+ --minExon Minimum number of exons in one bin (if less than this number
+ , bin that contains small number of exons will be merged to
+ the adjacent bins) [Default : 2000]
+
+ --minControlRdForCall Minimum Control ReadDepth for call [Default: 5]
+
+ --minTestRdForCall Minimum Test ReadDepth for call [Default: 0]
+
+ --minAvgForCall Minimum average coverage for call [Default: 20]
+
+ --maxRegionSize Maximum region size in target region (for breaking
+ large regions into smaller regions. By default,
+ maxRegionSize=0 means no breakdown). [Default : 0]
+
+ --targetRegionSize Target region size for breakdown (if maxRegionSize
+ is non-zero) [Default: 200]
+
+ -l, --largeDeletion If specified, CONTRA will run large deletion analysis (CBS).
+ User must have DNAcopy R-library installed to run the
+ analysis. [False]
+
+ --smallSegment CBS segment size for calling large variations [Default : 1]
+
+ --largeSegment CBS segment size for calling large variations [Default : 25]
+
+ --lrCallStart Log ratios start range that will be used to call CNV
+ [Default : -0.3]
+
+ --lrCallEnd Log ratios end range that will be used to call CNV
+ [Default : 0.3]
+
+ --passSize Size of exons that passed the p-value threshold compare
+ to the original exons size [Default: 0.5]
+
+
+
+
diff -r e8a98923965e -r 2770f49cb0dc contra_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/contra_wrapper.pl Tue May 20 09:59:00 2014 +1000
@@ -0,0 +1,91 @@
+use strict;
+use warnings;
+
+use FindBin;
+use File::Path qw(make_path);
+use File::Spec;
+
+
+die "Bad number of inputs" if(!@ARGV);
+
+my $player_options = "";
+my $contra_output;
+my $contra_dir;
+
+
+
+foreach my $input (@ARGV)
+{
+    # Limit of 2 keeps a "::" that occurs inside the value itself.
+    my ($key, $value) = split /::/, $input, 2;
+    if($key eq "PLAYEROPTION")
+    {
+        # Only the first "=" separates option from value; later ones are data.
+        $value =~ s/=/ /;
+        print "$value\n";
+        $player_options = "$player_options $value";
+    }
+    elsif($key eq "CONTRAOUTPUT")
+    {
+        $contra_output = $value;
+    }
+    elsif($key eq "CONTRADIR")
+    {
+        $contra_dir = $value;
+    }
+    else
+    {
+        # Anything else is a tool-definition error, so stop immediately.
+        die("Unknown input: $input\n");
+    }
+}
+
+
+my $working_dir = "CONTRA_OUTPUT";
+make_path($contra_dir);
+#remove extension
+
+#run contra
+system(File::Spec->catfile($FindBin::Bin, 'contra.py') . " -o $working_dir $player_options > /dev/null 2>&1");
+
+#set html
+#print "$contra_output - $working_dir\n";
+open(HTML, ">$contra_output");
+print HTML "Contra: Copy Number Analysis for Targeted Resequencing