Repository 'arriba_draw_fusions'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/arriba_draw_fusions

Changeset 0:2d4e3aff9dc7 (2022-07-27)
Next changeset 1:cf18a5993aa2 (2022-09-23)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/arriba commit b12158e6cc9b1b2bd6e7522dfc183e9055575823
added:
arriba_draw_fusions.xml
macros.xml
static/images/draw-fusions-example.png
test-data/Aligned.out.bam
test-data/Aligned.out.bam.bai
test-data/Aligned.out.sam
test-data/cytobands.tsv
test-data/fusions.tsv
test-data/genome.fasta.gz
test-data/genome.gtf.gz
test-data/protein_domains.gff3
tool-data/all_fasta.loc.sample
tool_data_table_conf.xml.sample
b
diff -r 000000000000 -r 2d4e3aff9dc7 arriba_draw_fusions.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arriba_draw_fusions.xml Wed Jul 27 11:25:43 2022 +0000
[
@@ -0,0 +1,148 @@
+<tool id="arriba_draw_fusions" name="Arriba Draw Fusions" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
+    <description></description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version_command" />
+    <command detect_errors="exit_code"><![CDATA[
+    ## Stage the alignments as Aligned.sortedByCoord.out.bam (plus index), the
+    ## file name that the DRAW_FUSIONS macro passes to draw_fusions.R.
+    #if $alignments.extension == 'sam'
+        ## SAM input: stage the reference as genome.fa so that samtools can
+        ## index it, then convert, sort and index the alignments.
+        #if str($genome.genome_source) == 'cached'
+            ## Built-in reference: "path" is the file column of the all_fasta data table.
+            ln -sf '$genome.ref_file.fields.path' genome.fa &&
+        #else
+            ln -sf '$genome.assembly' genome.fa &&
+        #end if
+        samtools faidx genome.fa &&
+        samtools view -b -@ \${GALAXY_SLOTS:-1} -t genome.fa.fai '$alignments' |
+        samtools sort -O bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o Aligned.sortedByCoord.out.bam &&
+        samtools index Aligned.sortedByCoord.out.bam &&
+    #else
+        ## BAM input: link the dataset and its index under the expected names.
+        ln -sf '${alignments}' 'Aligned.sortedByCoord.out.bam' &&
+        ln -sf '$alignments.metadata.bam_index' 'Aligned.sortedByCoord.out.bam.bai' &&
+    #end if
+    @DRAW_FUSIONS@
+    ]]></command>
+    <inputs>
+        <!-- Fusion predictions table and the SAM/BAM alignments to plot coverage from -->
+        <param argument="--fusions"
+               type="data"
+               format="tabular"
+               label="Arriba fusions.tsv"/>
+        <param argument="--alignments"
+               type="data"
+               format="sam,bam"
+               label="STAR Aligned.out.bam"/>
+        <!-- Genome assembly is optional here: the command only uses it when converting SAM input -->
+        <expand macro="genome_source" assembly_optional="true"/>
+        <!-- Gene annotation (GTF) selector -->
+        <expand macro="gtf_source"/>
+        <!-- Optional protein domain annotation -->
+        <param name="protein_domains"
+               argument="-p"
+               type="data"
+               format="gff3"
+               optional="true"
+               label="File containing protein domains"/>
+        <!-- Cytobands and drawing options are supplied by the visualization_options macro -->
+        <section name="visualization" expanded="true" title="Visualization Options">
+            <expand macro="visualization_options"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Collected from fusions.pdf in the job working directory, the file named by the output option of draw_fusions.R -->
+        <data name="fusions_pdf"
+              format="pdf"
+              label="${tool.name} on ${on_string}: fusions.pdf"
+              from_work_dir="fusions.pdf"/>
+    </outputs>
+    <tests>
+        <!-- Test 1: SAM alignments, exercising the samtools convert/sort/index branch of the command -->
+        <test>
+            <param name="fusions" ftype="tabular" value="fusions.tsv"/>
+            <param name="alignments" ftype="sam" value="Aligned.out.sam"/>
+            <conditional name="genome">
+                <param name="genome_source" value="history"/>
+                <param name="assembly" ftype="fasta" value="genome.fasta.gz"/>
+            </conditional>
+            <conditional name="genome_gtf">
+                <param name="gtf_source" value="history"/>
+                <param name="annotation" ftype="gtf" value="genome.gtf.gz"/>
+            </conditional>
+            <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/>
+            <section name="visualization">
+                <param name="cytobands" ftype="tabular" value="cytobands.tsv"/>
+            </section>
+            <!-- Only the approximate size of the PDF is checked -->
+            <output name="fusions_pdf">
+                <assert_contents>
+                    <has_size value="64000" delta="5000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 2: BAM alignments (linked with their index) plus a custom sample name.
+             The "options" section is nested inside "visualization" to mirror the
+             input structure defined by the visualization_options macro. -->
+        <test>
+            <param name="fusions" ftype="tabular" value="fusions.tsv"/>
+            <param name="alignments" ftype="bam" value="Aligned.out.bam"/>
+            <conditional name="genome">
+                <param name="genome_source" value="history"/>
+                <param name="assembly" ftype="fasta" value="genome.fasta.gz"/>
+            </conditional>
+            <conditional name="genome_gtf">
+                <param name="gtf_source" value="history"/>
+                <param name="annotation" ftype="gtf" value="genome.gtf.gz"/>
+            </conditional>
+            <param name="protein_domains" ftype="gff3" value="protein_domains.gff3"/>
+            <section name="visualization">
+                <param name="cytobands" ftype="tabular" value="cytobands.tsv"/>
+                <section name="options">
+                    <param name="sampleName" value="My Test"/>
+                </section>
+            </section>
+            <!-- Only the approximate size of the PDF is checked -->
+            <output name="fusions_pdf">
+                <assert_contents>
+                    <has_size value="64000" delta="5000" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**Arriba Draw Fusions**
+
+Arriba_Draw_Fusions_ (draw_fusions.R) renders publication-quality visualizations of the transcripts involved in predicted fusions. It generates a PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and - if the column fusion_transcript has a value - an excerpt of the sequence around the breakpoint.
+
+
+**INPUTS**
+
+See: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr
+
+  - Fusions  
+
+    File containing fusion predictions from Arriba_ (fusions.tsv) or STAR-Fusion (star-fusion.fusion_predictions.tsv or star-fusion.fusion_predictions.abridged.coding_effect.tsv).
+
+  - Annotation
+
+    Gene annotation in GTF format that was used by the STAR aligner. 
+
+  - Alignments
+
+    SAM or BAM file containing normal alignments from STAR. SAM input is converted to a sorted, indexed BAM file with samtools before plotting.
+
+  - Annotation
+
+    The gene annotation (parameter -g) is used for multiple purposes; see the Arriba documentation linked above for details.
+
+  - Assembly (Optional)
+
+    Only required when the alignments are not in sorted BAM format. The genome assembly will be used by samtools to produce a sorted BAM file.
+
+  - Protein domains (Optional)
+
+    GFF3 file containing the genomic coordinates of protein domains. Distributions of Arriba offer protein domain annotations for all supported assemblies in the database directory. When this file is given, a plot is generated, which shows the protein domains retained in the fusion transcript. 
+
+  - Cytobands (Optional)
+
+    Coordinates of the Giemsa staining bands. This information is used to draw ideograms. If the argument is omitted, then no ideograms are rendered. The file must have the following columns: contig, start, end, name, giemsa. Recognized values for the Giemsa staining intensity are: gneg, gpos followed by a percentage, acen, stalk. Cytobands for human and mouse references can be retrieved from the Arriba distribution with the **Arriba Get Filters** tool.
+
+
+**OPTIONS**
+
+  See: https://arriba.readthedocs.io/en/latest/command-line-options/#draw_fusionsr
+
+
+**OUTPUTS**
+
+See: https://arriba.readthedocs.io/en/latest/visualization/
+
+  - fusions.pdf
+
+    A PDF file with one page for each predicted fusion. Each page depicts the fusion partners, their orientation, the retained exons in the fusion transcript, statistics about the number of supporting reads, and - if the column fusion_transcript has a value - an excerpt of the sequence around the breakpoint.
+
+.. image:: draw-fusions-example.png
+  :width: 800
+  :height: 467
+
+
+
+
+.. _Arriba_Draw_Fusions: https://arriba.readthedocs.io/en/latest/visualization/
+.. _Arriba: https://arriba.readthedocs.io/en/latest/
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
b
diff -r 000000000000 -r 2d4e3aff9dc7 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Jul 27 11:25:43 2022 +0000
[
b'@@ -0,0 +1,311 @@\n+<macros>\n+    <token name="@TOOL_VERSION@">2.3.0</token>\n+    <token name="@VERSION_SUFFIX@">0</token>\n+    <xml name="requirements">\n+        <requirements>\n+        <requirement type="package" version="@TOOL_VERSION@">arriba</requirement>\n+            <yield/>\n+        </requirements>\n+    </xml>\n+    <xml name="citations">\n+        <citations>\n+            <citation type="doi">10.1101/gr.257246.119</citation>\n+            <yield />\n+        </citations>\n+    </xml>\n+    <xml name="version_command">\n+        <version_command>arriba -h | grep Version | sed \'s/^.* //\'</version_command>\n+    </xml>\n+    <xml name="genome_source" token_assembly_optional="false" >\n+        <conditional name="genome">\n+            <param name="genome_source" type="select" label="Genome assembly fasta (that was used for STAR alignment)">\n+                <option value="history">From your history</option>\n+                <option value="cached">Use built-in Genome reference</option>\n+            </param>\n+            <when value="history">\n+                <param name="assembly" argument="-a" type="data" format="fasta" optional="@ASSEMBLY_OPTIONAL@" label="Genome assembly fasta"/>\n+            </when>\n+            <when value="cached">\n+                <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">\n+                    <options from_data_table="all_fasta">\n+                        <validator type="no_options" message="No reference genomes are available" />\n+                    </options>\n+                </param>\n+            </when>\n+        </conditional>\n+    </xml>\n+    <xml name="gtf_source" token_assembly_optional="false" >\n+        <conditional name="genome_gtf">\n+            <param name="gtf_source" type="select" label="Genome GTF annotation source">\n+                <option value="history">From your history</option>\n+                <!-- <option 
value="cached">Use built-in Gtf annotation</option> -->\n+            </param>\n+            <when value="history">\n+                <param name="annotation" argument="-g" type="data" format="gtf" label="Gene annotation in GTF format"/>\n+            </when>\n+        </conditional>\n+    </xml>\n+\n+    <token name="@GENOME_SOURCE@"><![CDATA[\n+#if str($genome.genome_source) == "history"\n+    #if $genome.assembly\n+        #set $genome_assembly = \'genome.fa\'\n+        ln -sf \'$genome.assembly\' $genome_assembly &&\n+    #end if\n+#elif str($genome.genome_source) == "cached"\n+    #set $genome_assembly = $genome.ref_file.fields.fasta\n+#end if\n+    ]]></token>\n+    <token name="@GTF_SOURCE@"><![CDATA[\n+#if str($genome_gtf.gtf_source) == "history"\n+    #if $genome_gtf.annotation.is_of_type(\'gtf.gz\')\n+        #set $genome_annotation = \'genome.gtf.gz\'\n+    #else\n+        #set $genome_annotation = \'genome.gtf\'\n+    #end if\n+    ln -sf \'$genome_gtf.annotation\' $genome_annotation &&\n+#end if\n+    ]]></token>\n+\n+    <xml name="visualization_options">\n+                <param name="cytobands" argument="--cytobands" type="data" format="tabular" optional="true" label="Cytobands"/>\n+                <section name="options" expanded="false" title="Draw Fusion Options">\n+                    <param argument="--sampleName" type="text" value="" optional="true" label="Sample Name printed as the title on every page"/>\n+                    <param argument="--transcriptSelection" type="select" optional="true" label="Transcript selection">\n+                        <help>By default the transcript isoform with the highest coverage is drawn.\n+                             Alternatively, the transcript isoform that is provided in the columns\n+                             transcript_id1 and transcript_id2 in the given fusions file can be drawn.\n+                             Selecting the isoform with the highest coverage usually produces nicer plots,\n+        
                     in the sense that the coverage track is smooth and shows a visible increase in coverage '..b'              </param>\n+                    <param argument="--coverageRange" type="text" value="" optional="true" label="Maximum coverage for plot">\n+                        <help>When the parameter --alignments is used, coverage plots are drawn above the transcripts of the fused genes. \n+                              The plots can be cropped at a fixed level by passing a non-zero value to this parameter. \n+                              When only a single value is given, both coverage plots (for gene1 and gene2) are cropped at the same level. \n+                              When two comma-separated values are given, the cutoffs can be specified independently for the two plots. \n+                              A value of 0 indicates that no cropping should be applied (i.e., the cutoff is set to the peak coverage) \n+                              and that the coverage plots of both genes should be on the same scale. This is the default behavior. \n+                              A value of 0,0 also indicates that no cropping should be applied, \n+                              but the coverage plots of the two genes have different scales: \n+                              each one is scaled individually to the peak coverage of the respective gene. 
\n+                              Default: 0\n+                        </help>\n+                        <validator type="regex" message="">^\\d+(,\\d+)?$</validator>\n+                    </param>\n+                </section>\n+    </xml>\n+    <token name="@DRAW_FUSIONS@">\n+draw_fusions.R\n+    --fusions=\'$fusions\'\n+    --alignments=\'Aligned.sortedByCoord.out.bam\'\n+    --annotation=\'$genome_gtf.annotation\'\n+    --output=fusions.pdf\n+    #if $visualization.cytobands\n+    --cytobands=\'$visualization.cytobands\'\n+    #end if\n+    #if $protein_domains\n+    --proteinDomains=\'$protein_domains\'\n+    #end if\n+    ## Visualization Options\n+    #if $visualization.options.transcriptSelection\n+        --transcriptSelection=$visualization.options.transcriptSelection\n+    #end if\n+    #if $visualization.options.minConfidenceForCircosPlot\n+        --minConfidenceForCircosPlot=$visualization.options.minConfidenceForCircosPlot\n+    #end if\n+    #if $visualization.options.squishIntrons\n+        --squishIntrons=$visualization.options.squishIntrons\n+        #if $visualization.options.squishIntrons == \'FALSE\' and $visualization.options.showIntergenicVicinity\n+            --showIntergenicVicinity=\'$visualization.options.showIntergenicVicinity\'\n+        #end if\n+    #end if\n+    #if $visualization.options.mergeDomainsOverlappingBy\n+        --mergeDomainsOverlappingBy=$visualization.options.mergeDomainsOverlappingBy\n+    #end if\n+    #if $visualization.options.sampleName\n+        --sampleName=\'$visualization.options.sampleName\'\n+    #end if\n+    #if $visualization.options.printExonLabels\n+        --printExonLabels=$visualization.options.printExonLabels\n+    #end if\n+    #if $visualization.options.coverageRange\n+        --coverageRange=\'$visualization.options.coverageRange\'\n+    #end if\n+    #if $visualization.options.render3dEffect\n+        --render3dEffect=$visualization.options.render3dEffect\n+    #end if\n+    #if 
$visualization.options.optimizeDomainColors\n+        --optimizeDomainColors=$visualization.options.optimizeDomainColors\n+    #end if\n+    #if $visualization.options.color1\n+        --color1=\'$visualization.options.color1\'\n+    #end if\n+    #if $visualization.options.color2\n+        --color2=\'$visualization.options.color2\'\n+    #end if\n+    #if str($visualization.options.pdfWidth)\n+        --pdfWidth=$visualization.options.pdfWidth\n+    #end if\n+    #if str($visualization.options.pdfHeight)\n+        --pdfHeight=$visualization.options.pdfHeight\n+    #end if\n+    # fontFamily\n+    #if $visualization.options.fontFamily\n+        --fontFamily=$visualization.options.fontFamily\n+    #end if\n+    #if str($visualization.options.fontSize)\n+        --fontSize=$visualization.options.fontSize\n+    #end if\n+</token>\n+</macros>\n'
b
diff -r 000000000000 -r 2d4e3aff9dc7 static/images/draw-fusions-example.png
b
Binary file static/images/draw-fusions-example.png has changed
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/Aligned.out.bam
b
Binary file test-data/Aligned.out.bam has changed
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/Aligned.out.bam.bai
b
Binary file test-data/Aligned.out.bam.bai has changed
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/Aligned.out.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Aligned.out.sam Wed Jul 27 11:25:43 2022 +0000
b
b'@@ -0,0 +1,89 @@\n+@HD\tVN:1.4\tSO:coordinate\n+@SQ\tSN:22\tLN:269079\n+@SQ\tSN:9\tLN:515509\n+@PG\tID:STAR\tPN:STAR\tVN:2.7.8a\tCL:STAR   --runThreadN 12   --genomeDir tempstargenomedir   --genomeLoad NoSharedMemory   --readFilesIn /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368710.dat   /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368711.dat      --readFilesCommand zcat      --limitBAMsortRAM 122880000000   --outSAMtype BAM   SortedByCoordinate      --outSAMstrandField intronMotif   --outSAMattributes NH   HI   AS   nM   ch      --outSAMunmapped Within      --outSAMprimaryFlag OneBestScore   --outSAMmapqUnique 60   --outBAMsortingThreadN 12   --outBAMsortingBinsN 50   --outSAMattrIHstart 1   --winAnchorMultimapNmax 50   --chimSegmentMin 12   --chimOutType WithinBAM   Junctions      --chimOutJunctionFormat 1      --quantMode TranscriptomeSAM   GeneCounts      --quantTranscriptomeBan Singleend   --twopassMode Basic\n+@CO\tuser command line: STAR --runThreadN 12 --genomeLoad NoSharedMemory --genomeDir tempstargenomedir --readFilesIn /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368710.dat /panfs/roc/galaxy/PRODUCTION/database/files/001/368/dataset_1368711.dat --readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --twopassMode Basic  --quantMode TranscriptomeSAM GeneCounts --quantTranscriptomeBan Singleend --outSAMstrandField intronMotif --outSAMattrIHstart 1 --outSAMattributes NH HI AS nM ch --outSAMprimaryFlag OneBestScore --outSAMmapqUnique 60 --outSAMunmapped Within --chimSegmentMin 12 --outBAMsortingThreadN 12 --outBAMsortingBinsN 50 --winAnchorMultimapNmax 50 --limitBAMsortRAM 122880000000 --chimOutType WithinBAM Junctions --chimOutJunctionFormat 
1\n+BCR-ABL1-46\t163\t22\t225687\t60\t71M2994N7M1344N72M\t=\t225737\t5255\tAACTGGAGGCAGTGCCCAACATCCCCCTGGTGCCCGATGAGGAGCTGGACGCTTTGAACATCAAGATCTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTG\tCCCGGGGGG=GGGJJJGGJJJGGJJJJCJJGGJJGCJGCGGGC8J8JGGJJJJJGJJC(JGCCG=GGJJGCCCGC8GCCGGGGGG=GGCGGG1GG=GC1G=CJCJJCCCGGCGG1CGG1GGGGGGGG=GGGGGCCGCGGG8GGGCGG=GG\tNH:i:1\tHI:i:1\tAS:i:285\tnM:i:1\tXS:A:+\tNM:i:1\n+BCR-ABL1-72\t163\t22\t225696\t60\t62M2994N7M1344N81M\t=\t228752\t5264\tCAGTGCCCAACATCCCCCTGGTGCCCGATGAGGAGCTGCACGCTTTGAAGATCAAGATCTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCA\tCCCCGGGGGGGGGGJGJCCCJ1GJJJJGCGGGCJJJ=C1JJGGJGG8JGC=CCGJ1JGG8GGGGGJCGJCCGGGCG=CGGGGGGCGG=GGCGGG=8CCGCGGJJJ=JGGGCGGGGGCCGCCGGGGGGGGC=CCGCG8GGGGGC1GGGGCC\tNH:i:1\tHI:i:1\tAS:i:290\tnM:i:1\tXS:A:+\tNM:i:1\n+BCR-ABL1-46\t83\t22\t225737\t60\t21M2994N7M1344N105M717N17M\t=\t225687\t-5255\tGCTTTGAAGATCAAGATCTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGG\t=GGCGGGGGGG=GGGCCCGCCCGGGGGGGGGGCCGGGGCGG8CGCGGG1JGGCCGG(C=GCCCGGGGGGCGGGGGCGCGGCGGJCGGGJJGJGGGJJCGGGJJJGJJJJJJJGJJJJGGGJJJJJGGJJJJJGCJJJCGGGGGGGGGCCC\tNH:i:1\tHI:i:1\tAS:i:285\tnM:i:1\tXS:A:+\tNM:i:0\n+BCR-ABL1-72\t83\t22\t228752\t60\t3S7M1344N105M717N35M\t=\t225696\t-5264\tTCCAAGAAGTGTTTCAGAAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAAT\t=GGGGGG==GGGGCCCC=GGGGG=GGGGCGGGCGGGGGGG=CGGCCGCCJGGCGGGGG=GGG8GGGCGGC=G=CCJGGGGGGCGJJGJJCGGGGGGJJJGCJCCGJG=JJJGJGJJCJJJJGJJJJJJJ=GCJGJGCGGG=GGGGGGCC=\tNH:i:1\tHI:i:1\tAS:i:290\tnM:i:1\tXS:A:+\tNM:i:0\n+BCR-ABL1-4\t99\t22\t230111\t60\t97M717N53M\t=\t230176\t889\tAGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCC\tC==GGGGGGGGGGJJJJ1JJJGGJJGGJGGJJGJJCJGJGJJCGGCJGCJJJJCGJGGGGJGGGGGGCCG
G8JGGCGCGG=GGGGGGGGGGGGGG=GCCGJGGGCCGGGGGG1GGGGGGCGCGGCGGGGGG=GGGGGGGGGCCGCGGGCC\tNH:i:1\tHI:i:1\tAS:i:259\tnM:i:0\tch:A:1\tXS:A:+\tNM:i:0\n+BCR-ABL1-18\t99\t22\t230118\t60\t90M717N60M\t=\t230165\t882\tCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGTAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCAATCAGCCACTGGAT\tCCCGGGCGGGCGGJGJJJJJJJJJ='..b'GGTACCATGGGCCTGTGTCCCGCAATGCCGCTGAGTATCTGCTGAGCAGCGGGATCAATGGCAGCTTCTTGGTGCGTGAGAGTGAGAGCAGTCCTGGCCAGAGGTCCATCTCGCT\tCCGGGCGCGCGGGCG=CCCGGCGCGGGGC=CGGCGGCCGCGGGJJJJCCGCCG(GCCCCCGGCCGGG=G8GGGGGGCC=C=CGGJGJJJGC=JGGJJJGJGJ1JJJGC=JJJG=JCJJJJJJJ=JJGGGJJJCGJJJGGGGGCGG=GCCC\tNH:i:1\tHI:i:1\tAS:i:298\tnM:i:0\tNM:i:0\n+BCR-ABL1-74\t77\t*\t0\t0\t*\t*\t0\t0\tTCATTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATGACATTC\tCC11GGGGGGGGGGCCJJJGCGJJGJJJJJGGGGGGJJJGGJG==GCJCJ=GGJJGGJJGGCJGG=GGGGGJGGJGC=GC=GGGCGGGCGGGGCCGCGGGJCGC=GGC8CGCGCGGGGGGCGCC1GGCGCC=GCCGCGGC8GCGGGCCCG\tNH:i:0\tHI:i:0\tAS:i:155\tnM:i:2\tuT:A:1\n+BCR-ABL1-74\t141\t*\t0\t0\t*\t*\t0\t0\tCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAGGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAG\tCCCGGGGGGCGCGJGGJJGGJGJJJGJGGJJGGJGJJ1=JCJJGGGJJJJGGGJGCCJGGJGG=J1JG8JGCGGGJG=GC1CGCCGGCG(GGCGGCGGGGGCJC1CCGC==CCGGGGCGGCGGGCCGGCGCGC8CCCCGGG=GGGC=GGG\tNH:i:0\tHI:i:0\tAS:i:155\tnM:i:2\tuT:A:1\n+BCR-ABL1-66\t77\t*\t0\t0\t*\t*\t0\t0\tTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATGACATTCAGAAACCCATAGAG\tCCC=GGGGCGGGGJJJJJGJJJJ=JJJGJJ1GJJGJJJJJGJJJJJGGGGCGJJGGGJJJGGCGGGGJGCGG1JCGGG=GCCGCG=GC=G=GCCGGGGG8JGGGGGGGGGGGG=GGCGGC8GGCCGGGC=GGGGGGGGG=CGG=8GGCCG\tNH:i:0\tHI:i:0\tAS:i:159\tnM:i:0\tuT:A:1\n+BCR-ABL1-66\t141\t*\t0\t0\t*\t*\t0\t0\tCATTCCGCTGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTT
CAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAG\tCCCGGGGGGGGGGGGJ=JGJJJJJJJGGJJCCCJGJJ1JJJGCJGGGGJJJJ=GGGJGJGC(GGGGJGGGJG1=GGGGGGGG=G=C=GG8CC8GGGGGCCCCJCCCJGCG=GGCCGGCGGCGGCG==1GCCGGC1GGGGGCGGGGGGCGG\tNH:i:0\tHI:i:0\tAS:i:159\tnM:i:0\tuT:A:1\n+BCR-ABL1-58\t77\t*\t0\t0\t*\t*\t0\t0\tATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGAACTCCAAGG\tCCCGGCGGGGGGGGJJJJJGJJGJGJGJGJJJJJJJJJCJGJJJJGCG=8GGGJGJGGCGGJGCGJJJCJGGG=CGCCGGCCGGGCGCGGGCGCG1GGGCCCGGGGCG8GCCC=C8CGCGG=CCCGCCCCGGG=CCGGCGGGCGGGGGCG\tNH:i:0\tHI:i:0\tAS:i:185\tnM:i:3\tuT:A:1\n+BCR-ABL1-58\t141\t*\t0\t0\t*\t*\t0\t0\tTTGGGGTCATTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATTCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATG\tCCCGGGGGGGGGGJJJJJJGJGJJJGGJ=JJJJJJJJGC=GJJGGJJGJJGG1GCJGGGG=JGGG8C=GCCGC==GGGCGGGGGG=GGG=(G=CCGCCGGGGCJJJJGGGC8GCGCGCG8CGGCCGGGCGCGCGG8CCGG8CGGGGGGGG\tNH:i:0\tHI:i:0\tAS:i:185\tnM:i:3\tuT:A:1\n+BCR-ABL1-24\t77\t*\t0\t0\t*\t*\t0\t0\tCGCAGACCATCAATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGGCTGAGTGAAG\tCC11GCGGGGGGGJCGJGJJCCJJJJGJJJJGJJGGJJJCJJJG8JJJ1GJ=JGGGGJJJCG=8GGCGCCGGGCCGGGCGGGGCGGGGCCGCGGCCGGG=J1GCCC1(CCGGCGGGCCGCGGGCGGGGC=GGCGCCGCC1GCGGGGGCGG\tNH:i:0\tHI:i:0\tAS:i:154\tnM:i:3\tuT:A:1\n+BCR-ABL1-24\t141\t*\t0\t0\t*\t*\t0\t0\tTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACGATGACATTCAGAA\tC=CCGGGGGGGGCJ1GGJJJJ1JJJJJGJJ=GJJG8GGJ=GJGJJGJJGGGCGJGCGGGCGGG8GG=GJJGCG1GCGGJGCCGGCGGGCCGGGCG8GGGGG8C1==CGGCCCGCGGGGC8GCGGG8GGGCGCCGCCGCGGGCGGGGGGCG\tNH:i:0\tHI:i:0\tAS:i:154\tnM:i:3\tuT:A:1\n+BCR-ABL1-10\t77\t*\t0\t0\t*\t*\t0\t0\tAGGTTGGGGTCATTTTCACTGGGTCCAGCGAGAAGGTTTTCCTTGGAGTTCCAACGAGCGGCTTCACTCAGACCCTGAGGCTCAAAGTCAGATGCTACTGGCCGCTGAAGGGCTTTTGAACTCTGCTTAAATCCAGTGGCTGAGTGGACG\tCC=GGGGGGGGGG1GJJJJJCJJJJJJJJJ
JJGJ=GJJJGCJJJJCJGJGCJGJJJGGJJJGGCCGGJGC=GGJ1C8GGGGGGCGCCGGGGGGCGGGCGCCCG1GGCGCGCGGGCC8GCGCGCGC8CCCGCGCGGGGGCGGGGGCGGCGG\tNH:i:0\tHI:i:0\tAS:i:181\tnM:i:2\tuT:A:1\n+BCR-ABL1-10\t141\t*\t0\t0\t*\t*\t0\t0\tATAAGGAAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGA\t1CCGGCGGGGGG1GGJJJGCC1JJJJCCG=JGGJJGJJJ=GGGGGJJGGGGGGC1J=CJGCGGGGCGC(CGGGGG=GGGGG(G=CGGCGGGGCCCGC=CCCCJJCC8G1GGGGCGGGGGGCGCGGGGGGGCG=GGCCGCCGCC1G=GGGG\tNH:i:0\tHI:i:0\tAS:i:181\tnM:i:2\tuT:A:1\n'
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/cytobands.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cytobands.tsv Wed Jul 27 11:25:43 2022 +0000
b
@@ -0,0 +1,5 @@
+contig start end name giemsa
+22 1 40586 q11.22 gpos25
+22 40586 269079 q11.23 gneg
+9 1 21036 q34.11 gneg
+9 21036 515509 q34.12 gpos25
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/fusions.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fusions.tsv Wed Jul 27 11:25:43 2022 +0000
b
@@ -0,0 +1,2 @@
+#gene1 gene2 strand1(gene/fusion) strand2(gene/fusion) breakpoint1 breakpoint2 site1 site2 type split_reads1 split_reads2 discordant_mates coverage1 coverage2 confidence reading_frame tags retained_protein_domains closest_genomic_breakpoint1 closest_genomic_breakpoint2 gene_id1 gene_id2 transcript_id1 transcript_id2 direction1 direction2 filters fusion_transcript peptide_sequence read_identifiers
+BCR ABL1 +/+ +/+ 22:230999 9:275100 CDS/splice-site CDS/splice-site translocation 1 3 0 3 8 low in-frame . Bcr-Abl_oncoprotein_oligomerisation_domain(100%),C2_domain(100%),RhoGEF_domain(100%)|F-actin_binding(100%),Protein_kinase_domain(100%),SH2_domain(100%),SH3_domain(100%) . . ENSG00000186716 ENSG00000097007 ENST00000305877 ENST00000372348 downstream upstream . AGCTTCTCCCTGACATCCGTGGAGCTGCAGATGCTGACCAACTCGTGTGTGAAACTCCAGACTGTCCACAGCATTCCGCTGACCATCAATAAGGAAG___ATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAA|AAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGAACTCCAAGGAAAACCTTCTCGCTGGACCCAGTGAAAATGACCCCAACCTTTTCGTTGCACTGTATGATTTTGTGGCCAGTGGAGATAACACTCTAAGCATAACTAAAG___GTGAAAAGCTCCGGG SFSLTSVELQMLTNSCVKLQTVHSIPLTINKEDDESPGLYGFLNVIVHSATGFKQSS|kALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLR BCR-ABL1-4,BCR-ABL1-28,BCR-ABL1-60,BCR-ABL1-76
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/genome.fasta.gz
b
Binary file test-data/genome.fasta.gz has changed
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/genome.gtf.gz
b
Binary file test-data/genome.gtf.gz has changed
b
diff -r 000000000000 -r 2d4e3aff9dc7 test-data/protein_domains.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_domains.gff3 Wed Jul 27 11:25:43 2022 +0000
b
b'@@ -0,0 +1,83 @@\n+9\tpfam\tprotein_domain\t33502\t33541\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t33992\t34063\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t35324\t35381\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t37391\t37409\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t37479\t37553\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t38833\t38931\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t41390\t41413\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t41489\t41494\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t43744\t43846\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t44647\t44729\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t47496\t47541\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t51664\t51812\t0\t+\t.\tName=KH domain;color=#808080;gene_id=ENSG00000107164;gene_name=FUBP3;protein_domain_id=PF00013\n+9\tpfam\tprotein_domain\t102331\t102396\t0\t+\t.\tName=Zinc finger%2C C2H2 type;color=#80FF00;gene_id=ENSG00000130711;gene_name=PRDM12;protein_domain_id=PF00096\n+9\tpfam\tprotein_domain\t102412\t102480\t0\t+\t.\tName=C2H2-type zinc 
finger;color=#80FF80;gene_id=ENSG00000130711;gene_name=PRDM12;protein_domain_id=PF13894\n+9\tpfam\tprotein_domain\t114903\t114949\t0\t+\t.\tName=Exosome complex exonuclease RRP4 N-terminal region;color=#FF0000;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF14382\n+9\tpfam\tprotein_domain\t116528\t116596\t0\t+\t.\tName=Exosome complex exonuclease RRP4 N-terminal region;color=#FF0000;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF14382\n+9\tpfam\tprotein_domain\t121951\t121971\t0\t+\t.\tName=KH domain;color=#000080;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF15985\n+9\tpfam\tprotein_domain\t123179\t123300\t0\t+\t.\tName=KH domain;color=#000080;gene_id=ENSG00000130713;gene_name=EXOSC2;protein_domain_id=PF15985\n+9\tpfam\tprotein_domain\t275219\t275273\t0\t+\t.\tName=SH3 domain;color=#FF0000;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00018\n+9\tpfam\tprotein_domain\t275837\t275922\t0\t+\t.\tName=SH3 domain;color=#FF0000;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00018\n+9\tpfam\tprotein_domain\t275962\t276132\t0\t+\t.\tName=SH2 domain;color=#80FFFF;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00017\n+9\tpfam\tprotein_domain\t283799\t283855\t0\t+\t.\tName=SH2 domain;color=#80FFFF;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00017\n+9\tpfam\tprotein_domain\t283973\t284071\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t293165\t293249\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t293896\t294073\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t295904\t296088\t0\t+\t.\tName=Protein kinase 
domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t299451\t299603\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;gene_name=ABL1;protein_domain_id=PF00069\n+9\tpfam\tprotein_domain\t301104\t301156\t0\t+\t.\tName=Protein kinase domain;color=#80FF00;gene_id=ENSG00000097007;ge'..b'd=PF00053\n+9\tpfam\tprotein_domain\t489945\t490067\t0\t+\t.\tName=Laminin EGF domain;color=#FFFFFF;gene_id=ENSG00000050555;gene_name=LAMC3;protein_domain_id=PF00053\n+9\tpfam\tprotein_domain\t490710\t490856\t0\t+\t.\tName=Laminin EGF domain;color=#FFFFFF;gene_id=ENSG00000050555;gene_name=LAMC3;protein_domain_id=PF00053\n+22\tpfam\tprotein_domain\t2420\t2524\t0\t-\t.\tName=Armadillo/beta-catenin-like repeat;color=#000080;gene_id=ENSG00000100218;gene_name=RSPH14;protein_domain_id=PF00514\n+22\tpfam\tprotein_domain\t36321\t37004\t0\t+\t.\tName=G-protein alpha subunit;color=#80FFFF;gene_id=ENSG00000128266;gene_name=GNAZ;protein_domain_id=PF00503\n+22\tpfam\tprotein_domain\t63673\t63981\t0\t+\t.\tName=G-protein alpha subunit;color=#80FFFF;gene_id=ENSG00000128266;gene_name=GNAZ;protein_domain_id=PF00503\n+22\tpfam\tprotein_domain\t90736\t90740\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t93060\t93112\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t93619\t93720\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t96554\t96622\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t98578\t98629\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t99484\t99565\t0\t+\t.\tName=Ras 
family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t99749\t99839\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t101465\t101502\t0\t+\t.\tName=Ras family;color=#80FFFF;gene_id=ENSG00000100228;gene_name=RAB36;protein_domain_id=PF00071\n+22\tpfam\tprotein_domain\t121553\t121771\t0\t+\t.\tName=Bcr-Abl oncoprotein oligomerisation domain;color=#FF0000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF09036\n+22\tpfam\tprotein_domain\t201581\t201640\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t201941\t202126\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t208994\t209101\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t212118\t212178\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t213667\t213719\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t214220\t214312\t0\t+\t.\tName=RhoGEF domain;color=#000000;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00621\n+22\tpfam\tprotein_domain\t230954\t230999\t0\t+\t.\tName=C2 domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t233127\t233224\t0\t+\t.\tName=C2 domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t235610\t235741\t0\t+\t.\tName=C2 domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t250010\t250018\t0\t+\t.\tName=C2 
domain;color=#00FF00;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00168\n+22\tpfam\tprotein_domain\t252302\t252422\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n+22\tpfam\tprotein_domain\t253473\t253607\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n+22\tpfam\tprotein_domain\t254554\t254659\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n+22\tpfam\tprotein_domain\t255138\t255228\t0\t+\t.\tName=RhoGAP domain;color=#FFFFFF;gene_id=ENSG00000186716;gene_name=BCR;protein_domain_id=PF00620\n'
b
diff -r 000000000000 -r 2d4e3aff9dc7 tool-data/all_fasta.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample Wed Jul 27 11:25:43 2022 +0000
b
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id> <dbkey> <display_name> <file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa
+#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa
+#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
\ No newline at end of file
b
diff -r 000000000000 -r 2d4e3aff9dc7 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Wed Jul 27 11:25:43 2022 +0000
b
@@ -0,0 +1,7 @@
+<tables>
+    <!-- all_fasta: reference FASTA entries read from tool-data/all_fasta.loc -->
+    <table name="all_fasta"
+           comment_char="#"
+           allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>