Repository 'arriba_get_filters'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/arriba_get_filters

Changeset 0:125d20cb23d7 (2022-07-27)
Next changeset 1:1fe9d667447c (2022-09-23)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/arriba commit b12158e6cc9b1b2bd6e7522dfc183e9055575823
added:
arriba_get_filters.xml
macros.xml
static/images/draw-fusions-example.png
test-data/Aligned.out.bam
test-data/Aligned.out.bam.bai
test-data/Aligned.out.sam
test-data/cytobands.tsv
test-data/fusions.tsv
test-data/genome.fasta.gz
test-data/genome.gtf.gz
test-data/protein_domains.gff3
tool-data/all_fasta.loc.sample
tool_data_table_conf.xml.sample
b
diff -r 000000000000 -r 125d20cb23d7 arriba_get_filters.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arriba_get_filters.xml Wed Jul 27 11:25:14 2022 +0000
[
@@ -0,0 +1,73 @@
+<tool id="arriba_get_filters" name="Arriba Get Filters" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description>to history</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version_command" />
+    <command detect_errors="exit_code"><![CDATA[
+    ## Find the directory that holds download_references.sh inside the arriba installation;
+    ## the blacklist_*, known_fusions_*, protein_domains_* and cytobands_* files are searched for below it.
+    BASE_DIR=\$(dirname \$(dirname `which arriba`)) &&
+    REF_SCRIPT=`find \$BASE_DIR -name 'download_references.sh'` &&
+    REF_DIR=\$(dirname \$REF_SCRIPT) &&
+    ## Keep only the assembly part of the name: drop everything after the first '+' and the word 'viral'.
+    #set $ref_name = str($arriba_reference_name).split('+')[0].replace('viral', '')
+    ## The name is free text typed by the user, so it is quoted wherever it reaches the shell.
+    REF_NAME='$ref_name' &&
+    echo "\$REF_NAME" &&
+    cp `find \$REF_DIR -name 'blacklist_*' | grep -i -e "\$REF_NAME"` '$blacklist' &&
+    cp `find \$REF_DIR -name 'known_fusions_*' | grep -i -e "\$REF_NAME"` '$known_fusions' &&
+    cp `find \$REF_DIR -name 'protein_domains_*' | grep -i -e "\$REF_NAME"` '$protein_domains' &&
+    cp `find \$REF_DIR -name 'cytobands_*' | grep -i -e "\$REF_NAME"` '$cytobands'
+    ]]></command>
+    <inputs>
+        <param name="arriba_reference_name" type="text" label="Select reference">
+            <help>Assembly whose Arriba filter files are copied: GRCh38, GRCh37, hg38, hg19, GRCm39, GRCm38, mm39 or mm10. Anything after a '+' and the word 'viral' are ignored.</help>
+            <option value="GRCh38">GRCh38</option>
+            <option value="GRCh37">GRCh37</option>
+            <option value="hg38">hg38</option>
+            <option value="hg19">hg19</option>
+            <option value="GRCm39">GRCm39</option>
+            <option value="GRCm38">GRCm38</option>
+            <option value="mm39">mm39</option>
+            <option value="mm10">mm10</option>
+            <validator type="empty_field" message="A reference name is required"/>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="blacklist" format="tabular.gz" label="${tool.name} ${arriba_reference_name} blacklist.tsv.gz"/>
+        <data name="known_fusions" format="tabular.gz" label="${tool.name} ${arriba_reference_name} known_fusions.tsv.gz"/>
+        <data name="protein_domains" format="gff3" label="${tool.name} ${arriba_reference_name} protein_domains.gff3"/>
+        <data name="cytobands" format="tabular" label="${tool.name} ${arriba_reference_name} cytobands.tsv"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="arriba_reference_name" value="GRCh38"/>
+            <output name="cytobands">
+                <assert_contents>
+                    <has_text_matching expression="1\t1\t\d+\tp36.33\tgneg"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**Arriba Get Filters**
+
+Arriba_ is a fast tool to search for aberrant transcripts such as gene fusions.
+It is based on chimeric alignments found by the STAR RNA-Seq aligner.
+
+The **Arriba Get Filters** tool adds the following Arriba distribution input_files_ to your galaxy history:
+
+  - blacklist
+  - known_fusions
+  - protein_domains
+  - cytobands
+
+
+.. _Arriba: https://arriba.readthedocs.io/en/latest/
+.. _input_files: https://arriba.readthedocs.io/en/latest/input-files/
+
+]]></help>
+    <expand macro="citations" />
+</tool>
b
diff -r 000000000000 -r 125d20cb23d7 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Jul 27 11:25:14 2022 +0000
[
b'@@ -0,0 +1,311 @@\n+<macros>\n+    <token name="@TOOL_VERSION@">2.3.0</token>\n+    <token name="@VERSION_SUFFIX@">0</token>\n+    <xml name="requirements">\n+        <requirements>\n+        <requirement type="package" version="@TOOL_VERSION@">arriba</requirement>\n+            <yield/>\n+        </requirements>\n+    </xml>\n+    <xml name="citations">\n+        <citations>\n+            <citation type="doi">10.1101/gr.257246.119</citation>\n+            <yield />\n+        </citations>\n+    </xml>\n+    <xml name="version_command">\n+        <version_command>arriba -h | grep Version | sed \'s/^.* //\'</version_command>\n+    </xml>\n+    <xml name="genome_source" token_assembly_optional="false" >\n+        <conditional name="genome">\n+            <param name="genome_source" type="select" label="Genome assembly fasta (that was used for STAR alignment)">\n+                <option value="history">From your history</option>\n+                <option value="cached">Use built-in Genome reference</option>\n+            </param>\n+            <when value="history">\n+                <param name="assembly" argument="-a" type="data" format="fasta" optional="@ASSEMBLY_OPTIONAL@" label="Genome assembly fasta"/>\n+            </when>\n+            <when value="cached">\n+                <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">\n+                    <options from_data_table="all_fasta">\n+                        <validator type="no_options" message="No reference genomes are available" />\n+                    </options>\n+                </param>\n+            </when>\n+        </conditional>\n+    </xml>\n+    <xml name="gtf_source" token_assembly_optional="false" >\n+        <conditional name="genome_gtf">\n+            <param name="gtf_source" type="select" label="Genome GTF annotation source">\n+                <option value="history">From your history</option>\n+                <!-- <option 
value="cached">Use built-in Gtf annotation</option> -->\n+            </param>\n+            <when value="history">\n+                <param name="annotation" argument="-g" type="data" format="gtf" label="Gene annotation in GTF format"/>\n+            </when>\n+        </conditional>\n+    </xml>\n+\n+    <token name="@GENOME_SOURCE@"><![CDATA[\n+#if str($genome.genome_source) == "history"\n+    #if $genome.assembly\n+        #set $genome_assembly = \'genome.fa\'\n+        ln -sf \'$genome.assembly\' $genome_assembly &&\n+    #end if\n+#elif str($genome.genome_source) == "cached"\n+    #set $genome_assembly = $genome.ref_file.fields.fasta\n+#end if\n+    ]]></token>\n+    <token name="@GTF_SOURCE@"><![CDATA[\n+#if str($genome_gtf.gtf_source) == "history"\n+    #if $genome_gtf.annotation.is_of_type(\'gtf.gz\')\n+        #set $genome_annotation = \'genome.gtf.gz\'\n+    #else\n+        #set $genome_annotation = \'genome.gtf\'\n+    #end if\n+    ln -sf \'$genome_gtf.annotation\' $genome_annotation &&\n+#end if\n+    ]]></token>\n+\n+    <xml name="visualization_options">\n+                <param name="cytobands" argument="--cytobands" type="data" format="tabular" optional="true" label="Cytobands"/>\n+                <section name="options" expanded="false" title="Draw Fusion Options">\n+                    <param argument="--sampleName" type="text" value="" optional="true" label="Sample Name printed as the title on every page"/>\n+                    <param argument="--transcriptSelection" type="select" optional="true" label="Transcript selection">\n+                        <help>By default the transcript isoform with the highest coverage is drawn.\n+                             Alternatively, the transcript isoform that is provided in the columns\n+                             transcript_id1 and transcript_id2 in the given fusions file can be drawn.\n+                             Selecting the isoform with the highest coverage usually produces nicer plots,\n+        
                     in the sense that the coverage track is smooth and shows a visible increase in coverage '..b'              </param>\n+                    <param argument="--coverageRange" type="text" value="" optional="true" label="Maximum coverage for plot">\n+                        <help>When the parameter --alignments is used, coverage plots are drawn above the transcripts of the fused genes. \n+                              The plots can be cropped at a fixed level by passing a non-zero value to this parameter. \n+                              When only a single value is given, both coverage plots (for gene1 and gene2) are cropped at the same level. \n+                              When two comma-separated values are given, the cutoffs can be specified independently for the two plots. \n+                              A value of 0 indicates that no cropping should be applied (i.e., the cutoff is set to the peak coverage) \n+                              and that the coverage plots of both genes should be on the same scale. This is the default behavior. \n+                              A value of 0,0 also indicates that no cropping should be applied, \n+                              but the coverage plots of the two genes have different scales: \n+                              each one is scaled individually to the peak coverage of the respective gene. 
\n+                              Default: 0\n+                        </help>\n+                        <validator type="regex" message="">^\\d+(,\\d+)?$</validator>\n+                    </param>\n+                </section>\n+    </xml>\n+    <token name="@DRAW_FUSIONS@">\n+draw_fusions.R\n+    --fusions=\'$fusions\'\n+    --alignments=\'Aligned.sortedByCoord.out.bam\'\n+    --annotation=\'$genome_gtf.annotation\'\n+    --output=fusions.pdf\n+    #if $visualization.cytobands\n+    --cytobands=\'$visualization.cytobands\'\n+    #end if\n+    #if $protein_domains\n+    --proteinDomains=\'$protein_domains\'\n+    #end if\n+    ## Visualization Options\n+    #if $visualization.options.transcriptSelection\n+        --transcriptSelection=$visualization.options.transcriptSelection\n+    #end if\n+    #if $visualization.options.minConfidenceForCircosPlot\n+        --minConfidenceForCircosPlot=$visualization.options.minConfidenceForCircosPlot\n+    #end if\n+    #if $visualization.options.squishIntrons\n+        --squishIntrons=$visualization.options.squishIntrons\n+        #if $visualization.options.squishIntrons == \'FALSE\' and $visualization.options.showIntergenicVicinity\n+            --showIntergenicVicinity=\'$visualization.options.showIntergenicVicinity\'\n+        #end if\n+    #end if\n+    #if $visualization.options.mergeDomainsOverlappingBy\n+        --mergeDomainsOverlappingBy=$visualization.options.mergeDomainsOverlappingBy\n+    #end if\n+    #if $visualization.options.sampleName\n+        --sampleName=\'$visualization.options.sampleName\'\n+    #end if\n+    #if $visualization.options.printExonLabels\n+        --printExonLabels=$visualization.options.printExonLabels\n+    #end if\n+    #if $visualization.options.coverageRange\n+        --coverageRange=\'$visualization.options.coverageRange\'\n+    #end if\n+    #if $visualization.options.render3dEffect\n+        --render3dEffect=$visualization.options.render3dEffect\n+    #end if\n+    #if 
$visualization.options.optimizeDomainColors\n+        --optimizeDomainColors=$visualization.options.optimizeDomainColors\n+    #end if\n+    #if $visualization.options.color1\n+        --color1=\'$visualization.options.color1\'\n+    #end if\n+    #if $visualization.options.color2\n+        --color2=\'$visualization.options.color2\'\n+    #end if\n+    #if str($visualization.options.pdfWidth)\n+        --pdfWidth=$visualization.options.pdfWidth\n+    #end if\n+    #if str($visualization.options.pdfHeight)\n+        --pdfHeight=$visualization.options.pdfHeight\n+    #end if\n+    # fontFamily\n+    #if $visualization.options.fontFamily\n+        --fontFamily=$visualization.options.fontFamily\n+    #end if\n+    #if str($visualization.options.fontSize)\n+        --fontSize=$visualization.options.fontSize\n+    #end if\n+</token>\n+</macros>\n'
b
diff -r 000000000000 -r 125d20cb23d7 static/images/draw-fusions-example.png
b
Binary file static/images/draw-fusions-example.png has changed
b
diff -r 000000000000 -r 125d20cb23d7 test-data/Aligned.out.bam
b
Binary file test-data/Aligned.out.bam has changed
b
diff -r 000000000000 -r 125d20cb23d7 test-data/Aligned.out.bam.bai
b
Binary file test-data/Aligned.out.bam.bai has changed
b
diff -r 000000000000 -r 125d20cb23d7 test-data/Aligned.out.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Aligned.out.sam Wed Jul 27 11:25:14 2022 +0000
b
b'@@ -0,0 +1,89 @@\n+@HD\tVN:1.4\tSO:coordinate\n+@SQ\tSN:22\tLN:269079\n+@SQ\tSN:9\tLN:515509\n+@PG\tID:STAR\tPN:STAR\tVN:2.7.8a\tCL:STAR   --runThreadN 12   --genomeDir tempstargenomedir   --genomeLoad NoSharedMemory   --readFilesIn /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368710.dat   /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368711.dat      --readFilesCommand zcat      --limitBAMsortRAM 122880000000   --outSAMtype BAM   SortedByCoordinate      --outSAMstrandField intronMotif   --outSAMattributes NH   HI   AS   nM   ch      --outSAMunmapped Within      --outSAMprimaryFlag OneBestScore   --outSAMmapqUnique 60   --outBAMsortingThreadN 12   --outBAMsortingBinsN 50   --outSAMattrIHstart 1   --winAnchorMultimapNmax 50   --chimSegmentMin 12   --chimOutType WithinBAM   Junctions      --chimOutJunctionFormat 1      --quantMode TranscriptomeSAM   GeneCounts      --quantTranscriptomeBan Singleend   --twopassMode Basic\n+@CO\tuser command line: STAR --runThreadN 12 --genomeLoad NoSharedMemory --genomeDir tempstargenomedir --readFilesIn /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368710.dat /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368711.dat --readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --twopassMode Basic  --quantMode TranscriptomeSAM GeneCounts --quantTranscriptomeBan Singleend --outSAMstrandField intronMotif --outSAMattrIHstart 1 --outSAMattributes NH HI AS nM ch --outSAMprimaryFlag OneBestScore --outSAMmapqUnique 60 --outSAMunmapped Within --chimSegmentMin 12 --outBAMsortingThreadN 12 --outBAMsortingBinsN 50 --winAnchorMultimapNmax 50 --limitBAMsortRAM 122880000000 --chimOutType WithinBAM Junctions --chimOutJunctionFormat 
1\n+BCR-ABL1-46\t163\t22\t225687\t60\t71M2994N7M1344N72M\t=\t225737\t5255\tAACTGGAGGCAGTGCCCAACATCCCCCTGGTGCCCGATGAGGAGCTGGACGCTTTGAACATCAAGATCTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTG\tCCCGGGGGG=GGGJJJGGJJJGGJJJJCJJGGJJGCJGCGGGC8J8JGGJJJJJGJJC(JGCCG=GGJJGCCCGC8GCCGGGGGG=GGCGGG1GG=GC1G=CJCJJCCCGGCGG1CGG1GGGGGGGG=GGGGGCCGCGGG8GGGCGG=GG\tNH:i:1\tHI:i:1\tAS:i:285\tnM:i:1\tXS:A:+\tNM:i:1\n+BCR-ABL1-72\t163\t22\t225696\t60\t62M2994N7M1344N81M\t=\t228752\t5264\tCAGTGCCCAACATCCCCCTGGTGCCCGATGAGGAGCTGCACGCTTTGAAGATCAAGATCTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCA\tCCCCGGGGGGGGGGJGJCCCJ1GJJJJGCGGGCJJJ=C1JJGGJGG8JGC=CCGJ1JGG8GGGGGJCGJCCGGGCG=CGGGGGGCGG=GGCGGG=8CCGCGGJJJ=JGGGCGGGGGCCGCCGGGGGGGGC=CCGCG8GGGGGC1GGGGCC\tNH:i:1\tHI:i:1\tAS:i:290\tnM:i:1\tXS:A:+\tNM:i:1\n+BCR-ABL1-46\t83\t22\t225737\t60\t21M2994N7M1344N105M717N17M\t=\t225687\t-5255\tGCTTTGAAGATCAAGATCTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGG\t=GGCGGGGGGG=GGGCCCGCCCGGGGGGGGGGCCGGGGCGG8CGCGGG1JGGCCGG(C=GCCCGGGGGGCGGGGGCGCGGCGGJCGGGJJGJGGGJJCGGGJJJGJJJJJJJGJJJJGGGJJJJJGGJJJJJGCJJJCGGGGGGGGGCCC\tNH:i:1\tHI:i:1\tAS:i:285\tnM:i:1\tXS:A:+\tNM:i:0\n+BCR-ABL1-72\t83\t22\t228752\t60\t3S7M1344N105M717N35M\t=\t225696\t-5264\tTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAAT\t=GGGGGG==GGGGCCCC=GGGGG=GGGGCGGGCGGGGGGG=CGGCCGCCJGGCGGGGG=GGG8GGGCGGC=G=CCJGGGGGGCGJJGJJCGGGGGGJJJGCJCCGJG=JJJGJGJJCJJJJGJJJJJJJ=GCJGJGCGGG=GGGGGGCC=\tNH:i:1\tHI:i:1\tAS:i:290\tnM:i:1\tXS:A:+\tNM:i:0\n+BCR-ABL1-4\t99\t22\t230111\t60\t97M717N53M\t=\t230176\t889\tAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCC\tC==GGGGGGGGGGJJJJ1JJJGGJJGGJGGJJGJJCJGJGJJCGGCJGCJJJJCGJGGGGJGGGGGGCCG
G8JGGCGCGG=GGGGGGGGGGGGGG=GCCGJGGGCCGGGGGG1GGGGGGCGCGGCGGGGGG=GGGGGGGGGCCGCGGGCC\tNH:i:1\tHI:i:1\tAS:i:259\tnM:i:0\tch:A:1\tXS:A:+\tNM:i:0\n+BCR-ABL1-18\t99\t22\t230118\t60\t90M717N60M\t=\t230165\t882\tCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGTAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCAATCAGCCACTGGAT\tCCCGGGCGGGCGGJGJJJJJJJJJ='..b'GGTACCATGGGCCTGTGTCCCGCAATGCCGCTGAGTATCTGCTGAGCAGCGGGATCAATGGCAGCTTCTTGGTGCGTGAGAGTGAGAGCAGTCCTGGCCAGAGGTCCATCTCGCT\tCCGGGCGCGCGGGCG=CCCGGCGCGGGGC=CGGCGGCCGCGGGJJJJCCGCCG(GCCCCCGGCCGGG=G8GGGGGGCC=C=CGGJGJJJGC=JGGJJJGJGJ1JJJGC=JJJG=JCJJJJJJJ=JJGGGJJJCGJJJGGGGGCGG=GCCC\tNH:i:1\tHI:i:1\tAS:i:298\tnM:i:0\tNM:i:0\n+BCR-ABL1-74\t77\t*\t0\t0\t*\t*\t0\t0\tTCATTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATGACATTC\tCC11GGGGGGGGGGCCJJJGCGJJGJJJJJGGGGGGJJJGGJG==GCJCJ=GGJJGGJJGGCJGG=GGGGGJGGJGC=GC=GGGCGGGCGGGGCCGCGGGJCGC=GGC8CGCGCGGGGGGCGCC1GGCGCC=GCCGCGGC8GCGGGCCCG\tNH:i:0\tHI:i:0\tAS:i:155\tnM:i:2\tuT:A:1\n+BCR-ABL1-74\t141\t*\t0\t0\t*\t*\t0\t0\tCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAGGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAG\tCCCGGGGGGCGCGJGGJJGGJGJJJGJGGJJGGJGJJ1=JCJJGGGJJJJGGGJGCCJGGJGG=J1JG8JGCGGGJG=GC1CGCCGGCG(GGCGGCGGGGGCJC1CCGC==CCGGGGCGGCGGGCCGGCGCGC8CCCCGGG=GGGC=GGG\tNH:i:0\tHI:i:0\tAS:i:155\tnM:i:2\tuT:A:1\n+BCR-ABL1-66\t77\t*\t0\t0\t*\t*\t0\t0\tTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATGACATTCAGAAACCCATAGAG\tCCC=GGGGCGGGGJJJJJGJJJJ=JJJGJJ1GJJGJJJJJGJJJJJGGGGCGJJGGGJJJGGCGGGGJGCGG1JCGGG=GCCGCG=GC=G=GCCGGGGG8JGGGGGGGGGGGG=GGCGGC8GGCCGGGC=GGGGGGGGG=CGG=8GGCCG\tNH:i:0\tHI:i:0\tAS:i:159\tnM:i:0\tuT:A:1\n+BCR-ABL1-66\t141\t*\t0\t0\t*\t*\t0\t0\tCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTT
CAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAG\tCCCGGGGGGGGGGGGJ=JGJJJJJJJGGJJCCCJGJJ1JJJGCJGGGGJJJJ=GGGJGJGC(GGGGJGGGJG1=GGGGGGGG=G=C=GG8CC8GGGGGCCCCJCCCJGCG=GGCCGGCGGCGGCG==1GCCGGC1GGGGGCGGGGGGCGG\tNH:i:0\tHI:i:0\tAS:i:159\tnM:i:0\tuT:A:1\n+BCR-ABL1-58\t77\t*\t0\t0\t*\t*\t0\t0\tATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGAACTCCAAGG\tCCCGGCGGGGGGGGJJJJJGJJGJGJGJGJJJJJJJJJCJGJJJJGCG=8GGGJGJGGCGGJGCGJJJCJGGG=CGCCGGCCGGGCGCGGGCGCG1GGGCCCGGGGCG8GCCC=C8CGCGG=CCCGCCCCGGG=CCGGCGGGCGGGGGCG\tNH:i:0\tHI:i:0\tAS:i:185\tnM:i:3\tuT:A:1\n+BCR-ABL1-58\t141\t*\t0\t0\t*\t*\t0\t0\tTTGGGGTCATTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATTCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATG\tCCCGGGGGGGGGGJJJJJJGJGJJJGGJ=JJJJJJJJGC=GJJGGJJGJJGG1GCJGGGG=JGGG8C=GCCGC==GGGCGGGGGG=GGG=(G=CCGCCGGGGCJJJJGGGC8GCGCGCG8CGGCCGGGCGCGCGG8CCGG8CGGGGGGGG\tNH:i:0\tHI:i:0\tAS:i:185\tnM:i:3\tuT:A:1\n+BCR-ABL1-24\t77\t*\t0\t0\t*\t*\t0\t0\tCGCAGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGGCTGAGTGAAG\tCC11GCGGGGGGGJCGJGJJCCJJJJGJJJJGJJGGJJJCJJJG8JJJ1GJ=JGGGGJJJCG=8GGCGCCGGGCCGGGCGGGGCGGGGCCGCGGCCGGG=J1GCCC1(CCGGCGGGCCGCGGGCGGGGC=GGCGCCGCC1GCGGGGGCGG\tNH:i:0\tHI:i:0\tAS:i:154\tnM:i:3\tuT:A:1\n+BCR-ABL1-24\t141\t*\t0\t0\t*\t*\t0\t0\tTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATGACATTCAGAA\tC=CCGGGGGGGGCJ1GGJJJJ1JJJJJGJJ=GJJG8GGJ=GJGJJGJJGGGCGJGCGGGCGGG8GG=GJJGCG1GCGGJGCCGGCGGGCCGGGCG8GGGGG8C1==CGGCCCGCGGGGC8GCGGG8GGGCGCCGCCGCGGGCGGGGGGCG\tNH:i:0\tHI:i:0\tAS:i:154\tnM:i:3\tuT:A:1\n+BCR-ABL1-10\t77\t*\t0\t0\t*\t*\t0\t0\tAGGTTGGGGTCATTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACG\tCC=GGGGGGGGGG1GJJJJJCJJJJJJJJJ
JJGJ=GJJJGCJJJJCJGJGCJGJJJGGJJJGGCCGGJGC=GGJ1C8GGGGGGCGCCGGGGGGCGGGCGCCCG1GGCGCGCGGGCC8GCGCGCGC8CCCGCGCGGGGGCGGGGGCGGCGG\tNH:i:0\tHI:i:0\tAS:i:181\tnM:i:2\tuT:A:1\n+BCR-ABL1-10\t141\t*\t0\t0\t*\t*\t0\t0\tATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGA\t1CCGGCGGGGGG1GGJJJGCC1JJJJCCG=JGGJJGJJJ=GGGGGJJGGGGGGC1J=CJGCGGGGCGC(CGGGGG=GGGGG(G=CGGCGGGGCCCGC=CCCCJJCC8G1GGGGCGGGGGGCGCGGGGGGGCG=GGCCGCCGCC1G=GGGG\tNH:i:0\tHI:i:0\tAS:i:181\tnM:i:2\tuT:A:1\n'
b
diff -r 000000000000 -r 125d20cb23d7 test-data/cytobands.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cytobands.tsv Wed Jul 27 11:25:14 2022 +0000
b
@@ -0,0 +1,5 @@
+contig start end name giemsa
+22 1 40586 q11.22 gpos25
+22 40586 269079 q11.23 gneg
+9 1 21036 q34.11 gneg
+9 21036 515509 q34.12 gpos25
b
diff -r 000000000000 -r 125d20cb23d7 test-data/fusions.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fusions.tsv Wed Jul 27 11:25:14 2022 +0000
b
@@ -0,0 +1,2 @@
+#gene1 gene2 strand1(gene/fusion) strand2(gene/fusion) breakpoint1 breakpoint2 site1 site2 type split_reads1 split_reads2 discordant_mates coverage1 coverage2 confidence reading_frame tags retained_protein_domains closest_genomic_breakpoint1 closest_genomic_breakpoint2 gene_id1 gene_id2 transcript_id1 transcript_id2 direction1 direction2 filters fusion_transcript peptide_sequence read_identifiers
+BCR ABL1 +/+ +/+ 22:230999 9:275100 CDS/splice-site CDS/splice-site translocation 1 3 0 3 8 low in-frame . Bcr-Abl_oncoprotein_oligomerisation_domain(100%),C2_domain(100%),RhoGEF_domain(100%)|F-actin_binding(100%),Protein_kinase_domain(100%),SH2_domain(100%),SH3_domain(100%) . . ENSG00000186716 ENSG00000097007 ENST00000305877 ENST00000372348 downstream upstream . AGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAG___ATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAA|AAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGAACTCCAAGGAAAACCTTCTCGCTGGACCCAGTGAAAATGACCCCAACCTTTTCGTTGCACTGTATGATTTTGTGGCCAGTGGAGATAACACTCTAAGCATAACTAAAG___GTGAAAAGCTCCGGG SFSLTSVELQMLTNSCVKLQTVHSIPLTINKEDDESPGLYGFLNVIVHSATGFKQSS|kALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLR BCR-ABL1-4,BCR-ABL1-28,BCR-ABL1-60,BCR-ABL1-76
b
diff -r 000000000000 -r 125d20cb23d7 test-data/genome.fasta.gz
b
Binary file test-data/genome.fasta.gz has changed
b
diff -r 000000000000 -r 125d20cb23d7 test-data/genome.gtf.gz
b
Binary file test-data/genome.gtf.gz has changed
b
diff -r 000000000000 -r 125d20cb23d7 test-data/protein_domains.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_domains.gff3 Wed Jul 27 11:25:14 2022 +0000
b
b'@@ -0,0 +1,83 @@\n+9\tpfam\tprotein_domain\t33502\t33541\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t33992\t34063\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t35324\t35381\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t37391\t37409\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t37479\t37553\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t38833\t38931\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t41390\t41413\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t41489\t41494\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t43744\t43846\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t44647\t44729\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t47496\t47541\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t51664\t51812\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t102331\t102396\t0\t+\t.\tName=Zinc finger%2C C2H2 type;color=#80FF00;gene_id=ENSG00000130711;gene_name=PRDM12;protein_domain_id=PF00096\n+9\tpfam\tprotein_domain\t102412\t102480\t0\t+\t.\tName=C2H2-type zinc 
finger;color=#80FF80;gene_id=ENSG00000130711;gene_name=PRDM12;protein_domain_id=PF13894\n+9\tpfam\tprotein_domain\t114903\t114949\t0\t+\t.\tName=Exosome complex exonuclease RRP4 N-terminal region;color=#FF0000;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF14382\n+9\tpfam\tprotein_domain\t116528\t116596\t0\t+\t.\tName=Exosome complex exonuclease RRP4 N-terminal region;color=#FF0000;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF14382\n+9\tpfam\tprotein_domain\t121951\t121971\t0\t+\t.\tName=KH domain;color=#000080;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF15985\n+9\tpfam\tprotein_domain\t123179\t123300\t0\t+\t.\tName=KH domain;color=#000080;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF15985\n+9\tpfam\tprotein_domain\t275219\t275273\t0\t+\t.\tName=SH3 domain;color=#FF0000;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00018\n+9\tpfam\tprotein_domain\t275837\t275922\t0\t+\t.\tName=SH3 domain;color=#FF0000;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00018\n+9\tpfam\tprotein_domain\t275962\t276132\t0\t+\t.\tName=SH2 domain;color=#80FFFF;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00017\n+9\tpfam\tprotein_domain\t283799\t283855\t0\t+\t.\tName=SH2 domain;color=#80FFFF;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00017\n+9\tpfam\tprotein_domain\t283973\t284071\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t293165\t293249\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t293896\t294073\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t295904\t296088\t0\t+\t.\tName=Protein kinase 
domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t299451\t299603\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t301104\t301156\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;ge'..b'd=PF00053\n+9\tpfam\tprotein_domain\t489945\t490067\t0\t+\t.\tName=Laminin EGF domain;color=#FFFFFF;gene_id=ENSG00000050555;gene_name=LAMC3;protein_domain_id=PF00053\n+9\tpfam\tprotein_domain\t490710\t490856\t0\t+\t.\tName=Laminin EGF domain;color=#FFFFFF;gene_id=ENSG00000050555;gene_name=LAMC3;protein_domain_id=PF00053\n+22\tpfam\tprotein_domain\t2420\t2524\t0\t-\t.\tName=Armadillo/beta-catenin-like repeat;color=#000080;gene_id=ENSG00000100218;gene_name=RSPH14;protein_domain_id=PF00514\n+22\tpfam\tprotein_domain\t36321\t37004\t0\t+\t.\tName=G-protein alpha subunit;color=#80FFFF;gene_id=ENSG00000128266;gene_name=GNAZ;protein_domain_id=PF00503\n+22\tpfam\tprotein_domain\t63673\t63981\t0\t+\t.\tName=G-protein alpha subunit;color=#80FFFF;gene_id=ENSG00000128266;gene_name=GNAZ;protein_domain_id=PF00503\n+22\tpfam\tprotein_domain\t90736\t90740\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t93060\t93112\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t93619\t93720\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t96554\t96622\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t98578\t98629\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t99484\t99565\t0\t+\t.\tName=Ras 
family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t99749\t99839\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t101465\t101502\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t121553\t121771\t0\t+\t.\tName=Bcr-Abl oncoprotein oligomerisation domain;color=#FF0000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF09036\n+22\tpfam\tprotein_domain\t201581\t201640\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t201941\t202126\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t208994\t209101\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t212118\t212178\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t213667\t213719\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t214220\t214312\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t230954\t230999\t0\t+\t.\tName=C2 domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t233127\t233224\t0\t+\t.\tName=C2 domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t235610\t235741\t0\t+\t.\tName=C2 domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t250010\t250018\t0\t+\t.\tName=C2 
domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t252302\t252422\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n+22\tpfam\tprotein_domain\t253473\t253607\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n+22\tpfam\tprotein_domain\t254554\t254659\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n+22\tpfam\tprotein_domain\t255138\t255228\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n'
b
diff -r 000000000000 -r 125d20cb23d7 tool-data/all_fasta.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample Wed Jul 27 11:25:14 2022 +0000
b
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id> <dbkey> <display_name> <file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
+#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
+#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
\ No newline at end of file
b
diff -r 000000000000 -r 125d20cb23d7 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Wed Jul 27 11:25:14 2022 +0000
b
@@ -0,0 +1,7 @@
+<tables>
+    <!-- all_fasta: one entry per reference FASTA file; the columns below name the tab-separated fields of tool-data/all_fasta.loc -->
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>