Repository 'extract_genomic_dna'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/extract_genomic_dna

Changeset 0:8dd8e89c0603 (2016-01-19)
Next changeset 1:9af3f57e50b9 (2016-01-20)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit b'67cff25a50ba173b0468819204d0999496f68ea9'
added:
extract_genomic_dna.py
extract_genomic_dna.xml
extract_genomic_dna_utils.py
test-data/1.bed
test-data/cufflinks_out1.gtf
test-data/droPer1.bed
test-data/extract_genomic_dna_out1.fasta
test-data/extract_genomic_dna_out2.fasta
test-data/extract_genomic_dna_out3.interval
test-data/extract_genomic_dna_out4.gff
test-data/extract_genomic_dna_out5.fasta
test-data/extract_genomic_dna_out6.fasta
test-data/extract_genomic_dna_out7.fasta
test-data/gff_filter_by_attribute_out1.gff
test-data/tophat_in1.fasta
tool_data_table_conf.xml.sample
tool_dependencies.xml
twobit.loc.sample
b
diff -r 000000000000 -r 8dd8e89c0603 extract_genomic_dna.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_genomic_dna.py Tue Jan 19 09:34:23 2016 -0500
[
b'@@ -0,0 +1,207 @@\n+#!/usr/bin/env python\n+import argparse\n+import os\n+\n+import extract_genomic_dna_utils as egdu\n+import bx.seq.nib\n+import bx.seq.twobit\n+from bx.intervals.io import Header, Comment\n+\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument(\'--input_format\', dest=\'input_format\', help="Input dataset format")\n+parser.add_argument(\'--input\', dest=\'input\', help="Input dataset")\n+parser.add_argument(\'--genome\', dest=\'genome\', help="Input dataset genome build")\n+parser.add_argument(\'--interpret_features\', dest=\'interpret_features\', default=None, help="Interpret features if input format is gff")\n+parser.add_argument(\'--columns\', dest=\'columns\', help="Columns to use in input file")\n+parser.add_argument(\'--reference_genome_source\', dest=\'reference_genome_source\', help="Source of reference genome file")\n+parser.add_argument(\'--reference_genome\', dest=\'reference_genome\', help="Reference genome file")\n+parser.add_argument(\'--output_format\', dest=\'output_format\', help="Output format")\n+parser.add_argument(\'--output\', dest=\'output\', help="Output dataset")\n+args = parser.parse_args()\n+\n+input_is_gff = args.input_format == \'gff\'\n+interpret_features = input_is_gff and args.interpret_features == "yes"\n+if len(args.columns.split(\',\')) == 5:\n+    # Bed file.\n+    chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns)\n+else:\n+    # Gff file.\n+    chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns)\n+    name_col = False\n+\n+if args.reference_genome_source == "history":\n+    seq_path = egdu.convert_to_twobit(args.reference_genome)\n+else:\n+    seq_path = args.reference_genome\n+seq_dir = os.path.split(seq_path)[0]\n+\n+includes_strand_col = strand_col >= 0\n+strand = None\n+nibs = {}\n+skipped_lines = 0\n+first_invalid_line = 0\n+invalid_lines = []\n+warnings = []\n+warning = \'\'\n+twobitfile = None\n+line_count = 1\n+file_iterator = 
open(args.input)\n+if interpret_features:\n+    file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False)\n+out = open(args.output, \'wt\')\n+\n+for feature in file_iterator:\n+    # Ignore comments, headers.\n+    if isinstance(feature, (Header, Comment)):\n+        line_count += 1\n+        continue\n+    name = ""\n+    if interpret_features:\n+        # Processing features.\n+        egdu.convert_gff_coords_to_bed(feature)\n+        chrom = feature.chrom\n+        start = feature.start\n+        end = feature.end\n+        strand = feature.strand\n+    else:\n+        # Processing lines, either interval or GFF format.\n+        line = feature.rstrip(\'\\r\\n\')\n+        if line and not line.startswith("#"):\n+            fields = line.split(\'\\t\')\n+            try:\n+                chrom = fields[chrom_col]\n+                start = int(fields[start_col])\n+                end = int(fields[end_col])\n+                if name_col:\n+                    name = fields[name_col]\n+                if input_is_gff:\n+                    start, end = egdu.convert_gff_coords_to_bed([start, end])\n+                if includes_strand_col:\n+                    strand = fields[strand_col]\n+            except:\n+                warning = "Invalid chrom, start or end column values. "\n+                warnings.append(warning)\n+                if not invalid_lines:\n+                    invalid_lines = egdu.get_lines(feature)\n+                    first_invalid_line = line_count\n+                skipped_lines += len(invalid_lines)\n+                continue\n+            if start > end:\n+                warning = "Invalid interval, start \'%d\' > end \'%d\'.  
" % (start, end)\n+                warnings.append(warning)\n+                if not invalid_lines:\n+                    invalid_lines = egdu.get_lines(feature)\n+                    first_invalid_line = line_count\n+                skipped_lines += len(invalid_lines)\n+                continue\n+            if strand not in [\'+\', \'-\']:\n+                strand = \'+\'\n+            sequence = \'\'\n+        else:\n+            continue\n+    # Open sequence f'..b'rt, end - start, args.genome)\n+            warnings.append(warning)\n+            if not invalid_lines:\n+                invalid_lines = egdu.get_lines(feature)\n+                first_invalid_line = line_count\n+            skipped_lines += len(invalid_lines)\n+            continue\n+    elif os.path.isfile(seq_path):\n+        if not(twobitfile):\n+            twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))\n+        try:\n+            if interpret_features:\n+                # Create sequence from intervals within a feature.\n+                sequence = \'\'\n+                for interval in feature.intervals:\n+                    sequence += twobitfile[interval.chrom][interval.start:interval.end]\n+            else:\n+                sequence = twobitfile[chrom][start:end]\n+        except:\n+            warning = "Unable to fetch the sequence from \'%d\' to \'%d\' for chrom \'%s\'. " % (start, end - start, chrom)\n+            warnings.append(warning)\n+            if not invalid_lines:\n+                invalid_lines = egdu.get_lines(feature)\n+                first_invalid_line = line_count\n+            skipped_lines += len(invalid_lines)\n+            continue\n+    else:\n+        warning = "Chromosome by name \'%s\' was not found for build \'%s\'. 
" % (chrom, args.genome)\n+        warnings.append(warning)\n+        if not invalid_lines:\n+            invalid_lines = egdu.get_lines(feature)\n+            first_invalid_line = line_count\n+        skipped_lines += len(invalid_lines)\n+        continue\n+    if sequence == \'\':\n+        warning = "Chrom: \'%s\', start: \'%d\', end: \'%d\' is either invalid or not present in build \'%s\'. " % (chrom, start, end, args.genome)\n+        warnings.append(warning)\n+        if not invalid_lines:\n+            invalid_lines = egdu.get_lines(feature)\n+            first_invalid_line = line_count\n+        skipped_lines += len(invalid_lines)\n+        continue\n+    if includes_strand_col and strand == "-":\n+        sequence = egdu.reverse_complement(sequence)\n+    if args.output_format == "fasta":\n+        l = len(sequence)\n+        c = 0\n+        if input_is_gff:\n+            start, end = egdu.convert_bed_coords_to_gff([start, end])\n+        fields = [args.genome, str(chrom), str(start), str(end), strand]\n+        meta_data = "_".join(fields)\n+        if name.strip():\n+            out.write(">%s %s\\n" % (meta_data, name))\n+        else:\n+            out.write(">%s\\n" % meta_data)\n+        while c < l:\n+            b = min(c + 50, l)\n+            out.write("%s\\n" % str(sequence[c:b]))\n+            c = b\n+    else:\n+        # output_format == "interval".\n+        if interpret_features:\n+            meta_data = "\\t".join([feature.chrom,\n+                                   "galaxy_extract_genomic_dna",\n+                                   "interval",\n+                                   str(feature.start),\n+                                   str(feature.end),\n+                                   feature.score,\n+                                   feature.strand,\n+                                   ".",\n+                                   egdu.gff_attributes_to_str(feature.attributes, "GTF")])\n+        else:\n+            # Where is fields 
being set here?\n+            meta_data = "\\t".join(fields)\n+        if input_is_gff:\n+            format_str = "%s seq \\"%s\\";\\n"\n+        else:\n+            format_str = "%s\\t%s\\n"\n+        out.write(format_str % (meta_data, str(sequence)))\n+    # Update line count.\n+    if isinstance(feature, egdu.GFFFeature):\n+        line_count += len(feature.intervals)\n+    else:\n+        line_count += 1\n+out.close()\n+\n+if warnings:\n+    warn_msg = "%d warnings, 1st is: " % len(warnings)\n+    warn_msg += warnings[0]\n+    print warn_msg\n+if skipped_lines:\n+    # Error message includes up to the first 10 skipped lines.\n+    print \'Skipped %d invalid lines, 1st is #%d, "%s"\' % (skipped_lines, first_invalid_line, \'\\n\'.join(invalid_lines[:10]))\n+\n+if args.reference_genome_source == "history":\n+    os.remove(seq_path)\n'
b
diff -r 000000000000 -r 8dd8e89c0603 extract_genomic_dna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_genomic_dna.xml Tue Jan 19 09:34:23 2016 -0500
[
b'@@ -0,0 +1,202 @@\n+<tool id="Extract genomic DNA 1" name="Extract Genomic DNA" version="3.0.0">\n+    <description>using coordinates from assembled/unassembled genomes</description>\n+    <requirements>\n+        <requirement type="package" version="0.7.1">bx-python</requirement>\n+        <requirement type="package" version="35x1">faToTwoBit</requirement>\n+    </requirements>\n+    <command>\n+        <![CDATA[\n+            #set genome = $input.metadata.dbkey\n+            #set datatype = $input.datatype\n+            mkdir -p output_dir &&\n+            python $__tool_directory__/extract_genomic_dna.py\n+            --input "$input"\n+            --genome "$genome"\n+            #if $input.is_of_type("gff"):\n+                --input_format "gff"\n+                --columns "1,4,5,7"\n+                --interpret_features $interpret_features\n+            #else:\n+                --input_format "interval"\n+                --columns "${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol},${input.metadata.nameCol}"\n+            #end if\n+            --reference_genome_source $reference_genome_cond.reference_genome_source\n+            #if str($reference_genome_cond.reference_genome_source) == "cached"\n+                --reference_genome $reference_genome_cond.reference_genome.fields.path\n+            #else:\n+                --reference_genome $reference_genome_cond.reference_genome\n+            #end if\n+            --output_format $output_format\n+            --output $output\n+        ]]>\n+    </command>\n+    <inputs>\n+        <param name="input" type="data" format="gff,interval" label="Fetch sequences for intervals in">\n+            <validator type="unspecified_build" />\n+        </param>\n+        <param name="interpret_features" type="select" label="Interpret features when possible" help="Applicable only when input dataset format is in the gff family">\n+            <option 
value="yes">Yes</option>\n+            <option value="no">No</option>\n+        </param>\n+        <conditional name="reference_genome_cond">\n+            <param name="reference_genome_source" type="select" label="Choose the source for the reference genome">\n+                <option value="cached">locally cached</option>\n+                <option value="history">from history</option>\n+            </param>\n+            <when value="cached">\n+                <param name="reference_genome" type="select" label="Using reference genome">\n+                    <options from_data_table="twobit">\n+                        <filter type="data_meta" key="dbkey" ref="input" column="0"/>\n+                    </options>\n+                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>\n+                </param>\n+            </when>\n+            <when value="history">\n+                <param name="reference_genome" type="data" format="fasta" label="Using reference genome">\n+                    <options>\n+                        <filter type="data_meta" key="dbkey" ref="input"/>\n+                    </options>\n+                    <validator type="no_options" message="The current history does not include a fasta dataset with the build associated with the selected input file"/>\n+                </param>\n+            </when>\n+        </conditional>\n+        <param name="output_format" type="select" label="Select output format">\n+            <option value="fasta" selected="True">fasta</option>\n+            <option value="interval">interval</option>\n+        </param>\n+    </inputs>\n+    <outputs>\n+        <data name="output" format="gff">\n+            <change_format>\n+                <when output_format="interval" format="interval" />\n+            </change_format>\n+        </data>\n+    </outputs>\n+    <tests>\n+        <test>\n+            <param name="input" 
value="1.bed" dbkey="hg17" ftype="bed" />\n+            <param name="interpret_features" value="yes"/>\n'..b'      <param name="interpret_features" value="no"/>\n+            <param name="index_source" value="history"/>\n+            <param name="ref_file" value="tophat_in1.fasta"/>\n+            <param name="out_format" value="fasta"/>\n+            <output name="out_file1" file="extract_genomic_dna_out6.fasta" compare="contains" />\n+        </test>\n+        <test>\n+            <param name="input" value="cufflinks_out1.gtf" dbkey="mm9" ftype="gff" />\n+            <param name="interpret_features" value="yes"/>\n+            <param name="index_source" value="history"/>\n+            <param name="ref_file" value="tophat_in1.fasta"/>\n+            <param name="out_format" value="fasta"/>\n+            <output name="out_file1" file="extract_genomic_dna_out7.fasta" compare="contains" />\n+        </test>\n+    </tests>\n+    <help>\n+\n+.. class:: warningmark\n+\n+This tool requires interval or gff (special tabular formatted data).  If your data is not TAB delimited, first use *Text Manipulation-&gt;Convert*.\n+\n+.. class:: warningmark\n+\n+Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified). \n+\n+.. class:: warningmark\n+\n+All of the following will cause a line from the input dataset to be skipped and a warning generated.  The number of warnings and skipped lines is documented in the resulting history item.\n+ - Any lines that do not contain at least 3 columns, a chromosome and numerical start and end coordinates.\n+ - Sequences that fall outside of the range of a line\'s start and end coordinates. \n+ - Chromosome, start or end coordinates that are invalid for the specified build.\n+ - Any lines whose data columns are not separated by a **TAB** character ( other white-space characters are invalid ).\n+\n+.. 
class:: infomark\n+\n+ **Extract genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** previously were achieved by two separate tools. \n+\n+-----\n+\n+**What it does**\n+\n+This tool uses coordinate, strand, and build information to fetch genomic DNAs in FASTA or interval format.\n+\n+If strand is not defined, the default value is "+".\n+\n+-----\n+\n+**Example**\n+\n+If the input dataset is::\n+\n+    chr7  127475281  127475310  NM_000230  0  +\n+    chr7  127485994  127486166  NM_000230  0  +\n+    chr7  127486011  127486166  D49487     0  +\n+\n+Extracting sequences with **FASTA** output data type returns::\n+\n+    &gt;hg17_chr7_127475281_127475310_+ NM_000230\n+    GTAGGAATCGCAGCGCCAGCGGTTGCAAG\n+    &gt;hg17_chr7_127485994_127486166_+ NM_000230\n+    GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG\n+    GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC\n+    CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG\n+    GATCAATGACATTTCACACACG\n+    &gt;hg17_chr7_127486011_127486166_+ D49487\n+    TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG\n+    CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA\n+    CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC\n+    ACACG\n+\n+Extracting sequences with **Interval** output data type returns::\n+\n+    chr7    127475281       127475310       NM_000230       0       +       GTAGGAATCGCAGCGCCAGCGGTTGCAAG\n+    chr7    127485994       127486166       NM_000230       0       +       GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG\n+    chr7    127486011       127486166       D49487  0       +       TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG\n+\n+    </help>\n+    <citations>\n+        <citation type="bibtex">\n+            @unpublished{None,\n+            
author = {Guru Ananda,Greg Von Kuster},\n+            title = {None},\n+            year = {None},\n+            eprint = {None},\n+            url = {http://www.bx.psu.edu/~anton/labSite/}\n+        }</citation>\n+    </citations>\n+</tool>\n'
b
diff -r 000000000000 -r 8dd8e89c0603 extract_genomic_dna_utils.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_genomic_dna_utils.py Tue Jan 19 09:34:23 2016 -0500
[
b'@@ -0,0 +1,391 @@\n+import copy\n+import os\n+import subprocess\n+import sys\n+import tempfile\n+\n+from bx.intervals.io import Comment, Header, GenomicInterval\n+from bx.intervals.io import GenomicIntervalReader, NiceReaderWrapper, ParseError\n+\n+# Default chrom, start, end, strand cols for a bed file\n+BED_DEFAULT_COLS = 0, 1, 2, 5\n+\n+\n+class GFFInterval(GenomicInterval):\n+    """\n+    A GFF interval, including attributes. If file is strictly a GFF file,\n+    only attribute is \'group.\'\n+    """\n+\n+    def __init__(self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,\n+                 strand_col=6, score_col=5, default_strand=\'.\', fix_strand=False):\n+        # GFF format allows \'.\' for strand but GenomicInterval does not. To get around this,\n+        # temporarily set strand and then unset after initing GenomicInterval.\n+        unknown_strand = False\n+        if not fix_strand and fields[strand_col] == \'.\':\n+            unknown_strand = True\n+            fields[strand_col] = \'+\'\n+        GenomicInterval.__init__(self, reader, fields, chrom_col, start_col, end_col,\n+                                 strand_col, default_strand, fix_strand=fix_strand)\n+        if unknown_strand:\n+            self.strand = \'.\'\n+            self.fields[strand_col] = \'.\'\n+        # Handle feature, score column.\n+        self.feature_col = feature_col\n+        if self.feature_col >= self.nfields:\n+            stop_err("No field for feature_col (%d)" % feature_col)\n+        self.feature = self.fields[self.feature_col]\n+        self.score_col = score_col\n+        if self.score_col >= self.nfields:\n+            stop_err("No field for score_col (%d)" % score_col)\n+        self.score = self.fields[self.score_col]\n+        # GFF attributes.\n+        self.attributes = parse_gff_attributes(fields[8])\n+\n+    def copy(self):\n+        return GFFInterval(self.reader, list(self.fields), self.chrom_col, self.feature_col,\n+       
                    self.start_col, self.end_col, self.strand_col, self.score_col, self.strand)\n+\n+\n+class GFFFeature(GFFInterval):\n+    """\n+    A GFF feature, which can include multiple intervals.\n+    """\n+\n+    def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6,\n+                 score_col=5, default_strand=\'.\', fix_strand=False, intervals=[], raw_size=0):\n+        # Use copy so that first interval and feature do not share fields.\n+        GFFInterval.__init__(self, reader, copy.deepcopy(intervals[0].fields), chrom_col, feature_col,\n+                             start_col, end_col, strand_col, score_col, default_strand, fix_strand=fix_strand)\n+        self.intervals = intervals\n+        self.raw_size = raw_size\n+        # Use intervals to set feature attributes.\n+        for interval in self.intervals:\n+            # Error checking. NOTE: intervals need not share the same strand.\n+            if interval.chrom != self.chrom:\n+                stop_err("interval chrom does not match self chrom: %s != %s" % (interval.chrom, self.chrom))\n+            # Set start, end of interval.\n+            if interval.start < self.start:\n+                self.start = interval.start\n+            if interval.end > self.end:\n+                self.end = interval.end\n+\n+    def name(self):\n+        """\n+        Returns feature\'s name.\n+        """\n+        name = None\n+        # Preference for name:\n+        # GTF: \'gene_id\', \'transcript_id\'\n+        # GFF3: \'ID\', \'id\'\n+        # GFF: \'group\'\n+        for attr_name in [\'gene_id\', \'transcript_id\', \'ID\', \'id\', \'group\']:\n+            name = self.attributes.get(attr_name, None)\n+            if name is not None:\n+                break\n+        return name\n+\n+    def copy(self):\n+        intervals_copy = []\n+        for interval in self.intervals:\n+            intervals_copy.append(interval.copy())\n+        return 
GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col,\n+                          self.strand_col, self.score_col, s'..b'it()\n+        tmp_stderr.close()\n+        if returncode != 0:\n+            # Get stderr, allowing for case where it\'s very large.\n+            tmp_stderr = open(tmp_name, \'rb\')\n+            stderr = \'\'\n+            buffsize = 1048576\n+            try:\n+                while True:\n+                    stderr += tmp_stderr.read(buffsize)\n+                    if not stderr or len(stderr) % buffsize != 0:\n+                        break\n+            except OverflowError:\n+                pass\n+            tmp_stderr.close()\n+            os.remove(tmp_name)\n+            stop_err(stderr)\n+        return seq_path\n+    except Exception, e:\n+        stop_err(\'Error running faToTwoBit. \' + str(e))\n+\n+\n+def get_lines(feature):\n+    # Get feature\'s line(s).\n+    if isinstance(feature, GFFFeature):\n+        return feature.lines()\n+    else:\n+        return [feature.rstrip(\'\\r\\n\')]\n+\n+\n+def gff_attributes_to_str(attrs, gff_format):\n+    """\n+    Convert GFF attributes to string. 
Supported formats are GFF3, GTF.\n+    """\n+    if gff_format == \'GTF\':\n+        format_string = \'%s "%s"\'\n+        # Convert group (GFF) and ID, parent (GFF3) attributes to\n+        # transcript_id, gene_id.\n+        id_attr = None\n+        if \'group\' in attrs:\n+            id_attr = \'group\'\n+        elif \'ID\' in attrs:\n+            id_attr = \'ID\'\n+        elif \'Parent\' in attrs:\n+            id_attr = \'Parent\'\n+        if id_attr:\n+            attrs[\'transcript_id\'] = attrs[\'gene_id\'] = attrs[id_attr]\n+    elif gff_format == \'GFF3\':\n+        format_string = \'%s=%s\'\n+    attrs_strs = []\n+    for name, value in attrs.items():\n+        attrs_strs.append(format_string % (name, value))\n+    return " ; ".join(attrs_strs)\n+\n+\n+def parse_cols_arg(cols):\n+    """\n+    Parse a columns command line argument into a four-tuple.\n+    """\n+    if cols:\n+        # Handle case where no strand column included - in this case, cols\n+        # looks something like 1,2,3,\n+        if cols.endswith(\',\'):\n+            cols += \'0\'\n+        col_list = map(lambda x: int(x) - 1, cols.split(","))\n+        return col_list\n+    else:\n+        return BED_DEFAULT_COLS\n+\n+\n+def parse_gff_attributes(attr_str):\n+    """\n+    Parses a GFF/GTF attribute string and returns a dictionary of name-value\n+    pairs. 
The general format for a GFF3 attributes string is\n+        name1=value1;name2=value2\n+    The general format for a GTF attribute string is\n+        name1 "value1" ; name2 "value2"\n+    The general format for a GFF attribute string is a single string that\n+    denotes the interval\'s group; in this case, method returns a dictionary\n+    with a single key-value pair, and key name is \'group\'.\n+    """\n+    attributes_list = attr_str.split(";")\n+    attributes = {}\n+    for name_value_pair in attributes_list:\n+        # Try splitting by \'=\' (GFF3) first because spaces are allowed in GFF3\n+        # attribute; next, try double quotes for GTF.\n+        pair = name_value_pair.strip().split("=")\n+        if len(pair) == 1:\n+            pair = name_value_pair.strip().split("\\"")\n+        if len(pair) == 1:\n+            # Could not split for some reason.\n+            continue\n+        if pair == \'\':\n+            continue\n+        name = pair[0].strip()\n+        if name == \'\':\n+            continue\n+        # Need to strip double quote from values\n+        value = pair[1].strip(" \\"")\n+        attributes[name] = value\n+    if len(attributes) == 0:\n+        # Could not split attributes string, so entire string must be\n+        # \'group\' attribute. This is the case for strictly GFF files.\n+        attributes[\'group\'] = attr_str\n+    return attributes\n+\n+\n+def reverse_complement(s):\n+    complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"}\n+    reversed_s = []\n+    for i in s:\n+        reversed_s.append(complement_dna[i])\n+    reversed_s.reverse()\n+    return "".join(reversed_s)\n+\n+\n+def stop_err(msg):\n+    sys.stderr.write(msg)\n+    sys.exit(1)\n'
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/1.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.bed Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,65 @@
+chr1 147962192 147962580 CCDS989.1_cds_0_0_chr1_147962193_r 0 -
+chr1 147984545 147984630 CCDS990.1_cds_0_0_chr1_147984546_f 0 +
+chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 -
+chr1 148185136 148185276 CCDS996.1_cds_0_0_chr1_148185137_f 0 +
+chr10 55251623 55253124 CCDS7248.1_cds_0_0_chr10_55251624_r 0 -
+chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 -
+chr11 116206508 116206563 CCDS8377.1_cds_0_0_chr11_116206509_f 0 +
+chr11 116211733 116212337 CCDS8378.1_cds_0_0_chr11_116211734_r 0 -
+chr11 1812377 1812407 CCDS7726.1_cds_0_0_chr11_1812378_f 0 +
+chr12 38440094 38440321 CCDS8736.1_cds_0_0_chr12_38440095_r 0 -
+chr13 112381694 112381953 CCDS9526.1_cds_0_0_chr13_112381695_f 0 +
+chr14 98710240 98712285 CCDS9949.1_cds_0_0_chr14_98710241_r 0 -
+chr15 41486872 41487060 CCDS10096.1_cds_0_0_chr15_41486873_r 0 -
+chr15 41673708 41673857 CCDS10097.1_cds_0_0_chr15_41673709_f 0 +
+chr15 41679161 41679250 CCDS10098.1_cds_0_0_chr15_41679162_r 0 -
+chr15 41826029 41826196 CCDS10101.1_cds_0_0_chr15_41826030_f 0 +
+chr16 142908 143003 CCDS10397.1_cds_0_0_chr16_142909_f 0 +
+chr16 179963 180135 CCDS10401.1_cds_0_0_chr16_179964_r 0 -
+chr16 244413 244681 CCDS10402.1_cds_0_0_chr16_244414_f 0 +
+chr16 259268 259383 CCDS10403.1_cds_0_0_chr16_259269_r 0 -
+chr18 23786114 23786321 CCDS11891.1_cds_0_0_chr18_23786115_r 0 -
+chr18 59406881 59407046 CCDS11985.1_cds_0_0_chr18_59406882_f 0 +
+chr18 59455932 59456337 CCDS11986.1_cds_0_0_chr18_59455933_r 0 -
+chr18 59600586 59600754 CCDS11988.1_cds_0_0_chr18_59600587_f 0 +
+chr19 59068595 59069564 CCDS12866.1_cds_0_0_chr19_59068596_f 0 +
+chr19 59236026 59236146 CCDS12872.1_cds_0_0_chr19_59236027_r 0 -
+chr19 59297998 59298008 CCDS12877.1_cds_0_0_chr19_59297999_f 0 +
+chr19 59302168 59302288 CCDS12878.1_cds_0_0_chr19_59302169_r 0 -
+chr2 118288583 118288668 CCDS2120.1_cds_0_0_chr2_118288584_f 0 +
+chr2 118394148 118394202 CCDS2121.1_cds_0_0_chr2_118394149_r 0 -
+chr2 220190202 220190242 CCDS2441.1_cds_0_0_chr2_220190203_f 0 +
+chr2 220229609 220230869 CCDS2443.1_cds_0_0_chr2_220229610_r 0 -
+chr20 33330413 33330423 CCDS13249.1_cds_0_0_chr20_33330414_r 0 -
+chr20 33513606 33513792 CCDS13255.1_cds_0_0_chr20_33513607_f 0 +
+chr20 33579500 33579527 CCDS13256.1_cds_0_0_chr20_33579501_r 0 -
+chr20 33593260 33593348 CCDS13257.1_cds_0_0_chr20_33593261_f 0 +
+chr21 32707032 32707192 CCDS13614.1_cds_0_0_chr21_32707033_f 0 +
+chr21 32869641 32870022 CCDS13615.1_cds_0_0_chr21_32869642_r 0 -
+chr21 33321040 33322012 CCDS13620.1_cds_0_0_chr21_33321041_f 0 +
+chr21 33744994 33745040 CCDS13625.1_cds_0_0_chr21_33744995_r 0 -
+chr22 30120223 30120265 CCDS13897.1_cds_0_0_chr22_30120224_f 0 +
+chr22 30160419 30160661 CCDS13898.1_cds_0_0_chr22_30160420_r 0 -
+chr22 30665273 30665360 CCDS13901.1_cds_0_0_chr22_30665274_f 0 +
+chr22 30939054 30939266 CCDS13903.1_cds_0_0_chr22_30939055_r 0 -
+chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 +
+chr5 131556601 131556672 CCDS4151.1_cds_0_0_chr5_131556602_r 0 -
+chr5 131621326 131621419 CCDS4152.1_cds_0_0_chr5_131621327_f 0 +
+chr5 131847541 131847666 CCDS4155.1_cds_0_0_chr5_131847542_r 0 -
+chr6 108299600 108299744 CCDS5061.1_cds_0_0_chr6_108299601_r 0 -
+chr6 108594662 108594687 CCDS5063.1_cds_0_0_chr6_108594663_f 0 +
+chr6 108640045 108640151 CCDS5064.1_cds_0_0_chr6_108640046_r 0 -
+chr6 108722976 108723115 CCDS5067.1_cds_0_0_chr6_108722977_f 0 +
+chr7 113660517 113660685 CCDS5760.1_cds_0_0_chr7_113660518_f 0 +
+chr7 116512159 116512389 CCDS5771.1_cds_0_0_chr7_116512160_r 0 -
+chr7 116714099 116714152 CCDS5773.1_cds_0_0_chr7_116714100_f 0 +
+chr7 116945541 116945787 CCDS5774.1_cds_0_0_chr7_116945542_r 0 -
+chr8 118881131 118881317 CCDS6324.1_cds_0_0_chr8_118881132_r 0 -
+chr9 128764156 128764189 CCDS6914.1_cds_0_0_chr9_128764157_f 0 +
+chr9 128787519 128789136 CCDS6915.1_cds_0_0_chr9_128787520_r 0 -
+chr9 128882427 128882523 CCDS6917.1_cds_0_0_chr9_128882428_f 0 +
+chr9 128937229 128937445 CCDS6919.1_cds_0_0_chr9_128937230_r 0 -
+chrX 122745047 122745924 CCDS14606.1_cds_0_0_chrX_122745048_f 0 +
+chrX 152648964 152649196 CCDS14733.1_cds_0_0_chrX_152648965_r 0 -
+chrX 152691446 152691471 CCDS14735.1_cds_0_0_chrX_152691447_f 0 +
+chrX 152694029 152694263 CCDS14736.1_cds_0_0_chrX_152694030_r 0 -
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/cufflinks_out1.gtf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cufflinks_out1.gtf Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,4 @@
+test_chromosome Cufflinks transcript 53 550 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8543307.525072"; conf_hi "12814961.287608"; cov "145.770185";
+test_chromosome Cufflinks exon 53 250 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "1"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8543307.525072"; conf_hi "12814961.287608"; cov "145.770185";
+test_chromosome Cufflinks exon 351 400 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "2"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8543307.525072"; conf_hi "12814961.287608"; cov "145.770185";
+test_chromosome Cufflinks exon 501 550 1000 + . gene_id "CUFF.1"; transcript_id "CUFF.1.1"; exon_number "3"; FPKM "10679134.4063403048"; frac "1.000000"; conf_lo "8543307.525072"; conf_hi "12814961.287608"; cov "145.770185";
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/droPer1.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/droPer1.bed Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,2 @@
+super_1 139823 139913 AK028861 0 - 139823 139913 0 1 90, 0,
+super_1 156750 156844 BC126698 0 - 156750 156844 0 1 94, 0,
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out1.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out1.fasta Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,88 @@
+>hg17_chr1_147962192_147962580_-
+ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTG
+GACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGT
+GATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGT
+CCCTGTTCGTGAGGTCTGTCCAGTGACCCATCGTCCAGCCCTATACCGGG
+ACCCTGTTACAGACATACCCTATGCCACTGCTCGAGCCTTCAAGATCATT
+CGTGAGGCTTACAAGAAGTACATTACTGCCCATGGACTGCCGCCCACTGC
+CTCAGCCCTGGGCCCCGGCCCGCCACCTCCTGAGCCCCTCCCTGGCTCTG
+GGCCCCGAGCCTTGCGCCAGAAAATTGTCATTAAATGA
+>hg17_chr1_147984545_147984630_+
+ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTT
+TGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+>hg17_chr1_148078400_148078582_-
+GTTCTCAGCTTCCTTGCTTCCATGGCTCCAGCACCATTCGAAACCTCAAA
+GAGAGGTTCCACATGAGCATGACTGAGGAGCAGCTGCAGCTGCTGGTGGA
+GCAGATGGTGGATGGCAGTATGCGGTCTATCACCACCAAACTCTATGACG
+GCTTCCAGTACCTCACCAACGGCATCATGTGA
+>hg17_chr1_148185136_148185276_+
+ATGGAAGCGTTTTTGGGGTCGCGGTCCGGACTTTGGGCGGGGGGTCCGGC
+CCCAGGACAGTTTTACCGCATTCCGTCCACTCCCGATTCCTTCATGGATC
+CGGCGTCTGCACTTTACAGAGGTCCAATCACGCGGACCCA
+>hg17_chr10_55251623_55253124_-
+TCTTTTCCTTCTCTACCATTTTCAACAAAGCAGGGGAAATAACTCAGTCT
+CAGAAGACAGGAAACATCAACAAGTTGTGATGCCCTTTTCTTCCAATACT
+ATTGAGGCTCACAAGTCAGCTCATGTAGACGGATCACTTAAGAGCAACAA
+ACTGAAGTCTGCAAGAAAATTCACATTTCTATCTGATGAGGATGACTTAA
+GTGCCCATAATCCCCTTTATAAGGAAAACATAAGTCAAGTATCAACAAAT
+TCAGACATTTCACAGAGAACAGATTTTGTAGACCCATTTTCACCCAAAAT
+ACAAGCCAAGAGTAAGTCTCTGAGGGGCCCAAGAGAAAAGATTCAGAGGC
+TGTGGAGTCAGTCAGTCAGCTTACCCAGGAGGCTGATGAGGAAAGTTCCA
+AATAGACCAGAGATCATAGATCTGCAGCAGTGGCAAGGCACCAGGCAGAA
+AGCTGAAAATGAAAACACTGGAATCTGTACAAACAAAAGAGGTAGCAGCA
+ATCCATTGCTTACAACTGAAGAGGCAAATTTGACAGAGAAAGAGGAAATA
+AGGCAAGGTGAAACACTGATGATAGAAGGAACAGAACAGTTGAAATCTCT
+CTCTTCAGACTCTTCATTTTGCTTTCCCAGGCCTCACTTCTCATTCTCCA
+CTTTGCCAACTGTTTCAAGAACTGTGGAACTCAAATCAGAACCTAATGTC
+ATCAGTTCTCCTGCTGAGTGTTCCTTGGAACTTTCTCCTTCAAGGCCTTG
+TGTTTTACATTCTTCACTCTCTAGGAGAGAGACACCTATTTGTATGTTAC
+CTATTGAAACCGAAAGAAATATTTTTGAAAATTTTGCCCATCCACCAAAC
+ATCTCTCCTTCTGCCTGtccccttccccctcctcctcctatttctcctcc
+ttctcctcctcctgctcctgctcctcttgctcctcctcctgacatttctc
+ctttttctcttttttgtcctcctccctctcctccttctatccctcttcct
+cttcctcctcctACATTTTTTCCACTTTCCGTTTCAACGTCTGGTCCCCC
+AACAccacctcttctacctccatttccaactcctcttcctccaccacctc
+cttctattccttgccctccacctccttcAGCTTCATTTCTGTCCACAGAG
+TGTGTCTGTATAACAGGTGTTAAATGCACGACCAACTTGATGCCTGCCGA
+GAAAATTAAGTCCTCTATGACACAGCTATCAACAACGACAGTGTGTAAAA
+CAGACCCTCAGAGAGAACCAAAAGGCATCCTCAGACACGTTAAAAACTTA
+GCAGAACTTGAAAAATCAGTAGCTAACATGTACAGTCAAATAGAAAAAAA
+CTATCTACGCACAAATGTTTCAGAACTTCAAACTATGTGCCCTTCAGAAG
+TAACAAATATGGAAATCACATCTGAACAAAACAAGGGGAGTTTGAACAAT
+ATTGTCGAGGGAACTGAAAAACAATCTCACAGTCAATCTACTTCACTGTA
+A
+>hg17_chr11_116124407_116124501_-
+ATCCAATGGATTTGAACAGAAGCGCTTTGCCAGGCTTGCCAGCAAGAAGG
+CAGTGGAGGAACTTGCCTACAAATGGAGTGTTGAGGATATGTAA
+>hg17_chr11_116206508_116206563_+
+ATGCAGCCCCGGGTACTCCTTGTTGTTGCCCTCCTGGCGCTCCTGGCCTC
+TGCCC
+>hg17_chr11_116211733_116212337_-
+CCTAAAGCTCCTTGACAACTGGGACAGCGTGACCTCCACCTTCAGCAAGC
+TGCGCGAACAGCTCGGCCCTGTGACCCAGGAGTTCTGGGATAACCTGGAA
+AAGGAGACAGAGGGCCTGAGGCAGGAGATGAGCAAGGATCTGGAGGAGGT
+GAAGGCCAAGGTGCAGCCCTACCTGGACGACTTCCAGAAGAAGTGGCAGG
+AGGAGATGGAGCTCTACCGCCAGAAGGTGGAGCCGCTGCGCGCAGAGCTC
+CAAGAGGGCGCGCGCCAGAAGCTGCACGAGCTGCAAGAGAAGCTGAGCCC
+ACTGGGCGAGGAGATGCGCGACCGCGCGCGCGCCCATGTGGACGCGCTGC
+GCACGCATCTGGCCCCCTACAGCGACGAGCTGCGCCAGCGCTTGGCCGCG
+CGCCTTGAGGCTCTCAAGGAGAACGGCGGCGCCAGACTGGCCGAGTACCA
+CGCCAAGGCCACCGAGCATCTGAGCACGCTCAGCGAGAAGGCCAAGCCCG
+CGCTCGAGGACCTCCGCCAAGGCCTGCTGCCCGTGCTGGAGAGCTTCAAG
+GTCAGCTTCCTGAGCGCTCTCGAGGAGTACACTAAGAAGCTCAACACCCA
+GTGA
+>hg17_chr11_1812377_1812407_+
+ATGCTCCACCTGCATGGCTGGCAAACCATG
+>hg17_chr12_38440094_38440321_-
+GAGCTTTCTTCCTCTATGCTGGATTTGCTGCTGTGGGACTCCTTTTCATC
+TATGGCTGTCTTCCTGAGACCAAAGGCAAAAAATTAGAGGAAATTGAATC
+ACTCTTTGACAACAGGCTATGTACATGTGGCACTTCAGATTCTGATGAAG
+GGAGATATATTGAATATATTCGGGTAAAGGGAAGTAACTATCATCTTTCT
+GACAATGATGCTTCTGATGTGGAATAA
+>hg17_chr13_112381694_112381953_+
+ATGAACTCACCAGAGGCGAGGCTCTGCGTTGCTCAATGCAGAGACTCTTA
+CCCAGGGTGTCAGCCTCTGAAAGATACACGTGCCTGGGCCTCTTCCCTGA
+AGATGGACCCGGCAGGTCTGGAGGGAGGCCCCCGTGATGAATCCCGTGAT
+GAGCCGCCGATCCGAGCTCAGGCTGCGTCATGGGACCAGCCACAAGGTTG
+CCTGACCTATAAAGGTCGCAGGAGTGCCTCAGGGACACAGAAGCAGTTAC
+AGCTGCCAG
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out2.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out2.fasta Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,6 @@
+>droPer1_super_1_139823_139913_- AK028861
+CGTCGGCTTCTGCTTCTGCTGATGATGGTCGTTCTTCTTCCTTTACTTCT
+TCCTATTTTTCTTCCTTCCCTTACACTATATCTTCCTTTA
+>droPer1_super_1_156750_156844_- BC126698
+CCGGGCTGCGGCAAGGGATTCACCTGCTCCAAACAGCTCAAGGTGCACTC
+CCGCACGCACACGGGCGAGAAGCCCTATCACTGCGACATCTGCT
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out3.interval
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out3.interval Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,10 @@
+chr1 147962192 147962580 CCDS989.1_cds_0_0_chr1_147962193_r 0 - ACTTGATCCTGCTCCCTCGGTGTCTGCATTGACTCCTCATGCTGGGACTGGACCCGTCAACCCCCCTGCTCGCTGCTCACGTACCTTCATCACTTTTAGTGATGATGCAACTTTCGAGGAATGGTTCCCCCAAGGGCGGCCCCCAAAAGTCCCTGTTCGTGAGGTCTGTCCAGTGACCCATCGTCCAGCCCTATACCGGGACCCTGTTACAGACATACCCTATGCCACTGCTCGAGCCTTCAAGATCATTCGTGAGGCTTACAAGAAGTACATTACTGCCCATGGACTGCCGCCCACTGCCTCAGCCCTGGGCCCCGGCCCGCCACCTCCTGAGCCCCTCCCTGGCTCTGGGCCCCGAGCCTTGCGCCAGAAAATTGTCATTAAATGA
+chr1 147984545 147984630 CCDS990.1_cds_0_0_chr1_147984546_f 0 + ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG
+chr1 148078400 148078582 CCDS993.1_cds_0_0_chr1_148078401_r 0 - GTTCTCAGCTTCCTTGCTTCCATGGCTCCAGCACCATTCGAAACCTCAAAGAGAGGTTCCACATGAGCATGACTGAGGAGCAGCTGCAGCTGCTGGTGGAGCAGATGGTGGATGGCAGTATGCGGTCTATCACCACCAAACTCTATGACGGCTTCCAGTACCTCACCAACGGCATCATGTGA
+chr1 148185136 148185276 CCDS996.1_cds_0_0_chr1_148185137_f 0 + ATGGAAGCGTTTTTGGGGTCGCGGTCCGGACTTTGGGCGGGGGGTCCGGCCCCAGGACAGTTTTACCGCATTCCGTCCACTCCCGATTCCTTCATGGATCCGGCGTCTGCACTTTACAGAGGTCCAATCACGCGGACCCA
+chr10 55251623 55253124 CCDS7248.1_cds_0_0_chr10_55251624_r 0 - TCTTTTCCTTCTCTACCATTTTCAACAAAGCAGGGGAAATAACTCAGTCTCAGAAGACAGGAAACATCAACAAGTTGTGATGCCCTTTTCTTCCAATACTATTGAGGCTCACAAGTCAGCTCATGTAGACGGATCACTTAAGAGCAACAAACTGAAGTCTGCAAGAAAATTCACATTTCTATCTGATGAGGATGACTTAAGTGCCCATAATCCCCTTTATAAGGAAAACATAAGTCAAGTATCAACAAATTCAGACATTTCACAGAGAACAGATTTTGTAGACCCATTTTCACCCAAAATACAAGCCAAGAGTAAGTCTCTGAGGGGCCCAAGAGAAAAGATTCAGAGGCTGTGGAGTCAGTCAGTCAGCTTACCCAGGAGGCTGATGAGGAAAGTTCCAAATAGACCAGAGATCATAGATCTGCAGCAGTGGCAAGGCACCAGGCAGAAAGCTGAAAATGAAAACACTGGAATCTGTACAAACAAAAGAGGTAGCAGCAATCCATTGCTTACAACTGAAGAGGCAAATTTGACAGAGAAAGAGGAAATAAGGCAAGGTGAAACACTGATGATAGAAGGAACAGAACAGTTGAAATCTCTCTCTTCAGACTCTTCATTTTGCTTTCCCAGGCCTCACTTCTCATTCTCCACTTTGCCAACTGTTTCAAGAACTGTGGAACTCAAATCAGAACCTAATGTCATCAGTTCTCCTGCTGAGTGTTCCTTGGAACTTTCTCCTTCAAGGCCTTGTGTTTTACATTCTTCACTCTCTAGGAGAGAGACACCTATTTGTATGTTACCTATTGAAACCGAAAGAAATATTTTTGAAAATTTTGCCCATCCACCAAACATCTCTCCTTCTGCCTGtccccttccccctcctcctcctatttctcctccttctcctcctcctgctcctgctcctcttgctcctcctcctgacatttctcctttttctcttttttgtcctcctccctctcctccttctatccctcttcctcttcctcctcctACATTTTTTCCACTTTCCGTTTCAACGTCTGGTCCCCCAACAccacctcttctacctccatttccaactcctcttcctccaccacctccttctattccttgccctccacctccttcAGCTTCATTTCTGTCCACAGAGTGTGTCTGTATAACAGGTGTTAAATGCACGACCAACTTGATGCCTGCCGAGAAAATTAAGTCCTCTATGACACAGCTATCAACAACGACAGTGTGTAAAACAGACCCTCAGAGAGAACCAAAAGGCATCCTCAGACACGTTAAAAACTTAGCAGAACTTGAAAAATCAGTAGCTAACATGTACAGTCAAATAGAAAAAAACTATCTACGCACAAATGTTTCAGAACTTCAAACTATGTGCCCTTCAGAAGTAACAAATATGGAAATCACATCTGAACAAAACAAGGGGAGTTTGAACAATATTGTCGAGGGAACTGAAAAACAATCTCACAGTCAATCTACTTCACTGTAA
+chr11 116124407 116124501 CCDS8374.1_cds_0_0_chr11_116124408_r 0 - ATCCAATGGATTTGAACAGAAGCGCTTTGCCAGGCTTGCCAGCAAGAAGGCAGTGGAGGAACTTGCCTACAAATGGAGTGTTGAGGATATGTAA
+chr11 116206508 116206563 CCDS8377.1_cds_0_0_chr11_116206509_f 0 + ATGCAGCCCCGGGTACTCCTTGTTGTTGCCCTCCTGGCGCTCCTGGCCTCTGCCC
+chr11 116211733 116212337 CCDS8378.1_cds_0_0_chr11_116211734_r 0 - CCTAAAGCTCCTTGACAACTGGGACAGCGTGACCTCCACCTTCAGCAAGCTGCGCGAACAGCTCGGCCCTGTGACCCAGGAGTTCTGGGATAACCTGGAAAAGGAGACAGAGGGCCTGAGGCAGGAGATGAGCAAGGATCTGGAGGAGGTGAAGGCCAAGGTGCAGCCCTACCTGGACGACTTCCAGAAGAAGTGGCAGGAGGAGATGGAGCTCTACCGCCAGAAGGTGGAGCCGCTGCGCGCAGAGCTCCAAGAGGGCGCGCGCCAGAAGCTGCACGAGCTGCAAGAGAAGCTGAGCCCACTGGGCGAGGAGATGCGCGACCGCGCGCGCGCCCATGTGGACGCGCTGCGCACGCATCTGGCCCCCTACAGCGACGAGCTGCGCCAGCGCTTGGCCGCGCGCCTTGAGGCTCTCAAGGAGAACGGCGGCGCCAGACTGGCCGAGTACCACGCCAAGGCCACCGAGCATCTGAGCACGCTCAGCGAGAAGGCCAAGCCCGCGCTCGAGGACCTCCGCCAAGGCCTGCTGCCCGTGCTGGAGAGCTTCAAGGTCAGCTTCCTGAGCGCTCTCGAGGAGTACACTAAGAAGCTCAACACCCAGTGA
+chr11 1812377 1812407 CCDS7726.1_cds_0_0_chr11_1812378_f 0 + ATGCTCCACCTGCATGGCTGGCAAACCATG
+chr12 38440094 38440321 CCDS8736.1_cds_0_0_chr12_38440095_r 0 - GAGCTTTCTTCCTCTATGCTGGATTTGCTGCTGTGGGACTCCTTTTCATCTATGGCTGTCTTCCTGAGACCAAAGGCAAAAAATTAGAGGAAATTGAATCACTCTTTGACAACAGGCTATGTACATGTGGCACTTCAGATTCTGATGAAGGGAGATATATTGAATATATTCGGGTAAAGGGAAGTAACTATCATCTTTCTGACAATGATGCTTCTGATGTGGAATAA
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out4.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out4.gff Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,10 @@
+chr10 Cufflinks transcript 62044837 62045189 1000 . . gene_id "CUFF.23531"; transcript_id "CUFF.23531.1"; FPKM "19.5178121606"; frac "1.000000"; conf_lo "9.264456"; conf_hi "29.771168"; cov "1.108611"; seq "AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACTTTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTTTTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTCTGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCGGGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTTCTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCCTCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTCGCT";
+chr10 Cufflinks transcript 75372919 75373002 1000 . . gene_id "CUFF.24985"; transcript_id "CUFF.24985.1"; FPKM "124.4970510798"; frac "1.000000"; conf_lo "71.411330"; conf_hi "177.582772"; cov "7.071429"; seq "GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCACCAGCACCTCAGGCTGTGACGCATTCTCATGGATC";
+chr10 Cufflinks transcript 80362428 80363292 1000 - . gene_id "CUFF.26065"; transcript_id "CUFF.26065.1"; FPKM "43.6170921216"; frac "1.000000"; conf_lo "32.260169"; conf_hi "54.974016"; cov "2.477449"; seq "ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTTGGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGGCGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCCTCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCCGGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGTGTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCCATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTACAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGCCTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGTGTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATCGAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTGAGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCTCAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTCACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGGTATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAGCATGATCTCAGGTGTTTGGTCATCCCGGGGAGACCAGCCGAGGTTAAGAAGCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctgaccttaactctgaat";
+chr11 Cufflinks transcript 7904565 7904642 1000 . . gene_id "CUFF.33508"; transcript_id "CUFF.33508.1"; FPKM "61.6484988869"; frac "1.000000"; conf_lo "22.882428"; conf_hi "100.414569"; cov "3.501633"; seq "CATCTTCTATTTGAGCCTCCATCCAGGCACCTCTGAAACAAAGGTGCACTCACTGCATGTCCACTTGTCACAGGAGCC";
+chr11 Cufflinks exon 78140156 78140259 1000 . . gene_id "CUFF.43148"; transcript_id "CUFF.43148.1"; exon_number "1"; FPKM "54.8483511750"; frac "1.000000"; conf_lo "23.181641"; conf_hi "86.515061"; cov "3.115385"; seq "CTGCTTGCTAATTTTCTCTCTTGGGATCAGGGGGACGTGAACTCCAGCCCTGACTCGTGCTCCTTATGCTCTGAGTACATAGCAAATAAATGAGAGCAAAACAC";
+chr11 Cufflinks exon 105616462 105616737 1000 . . gene_id "CUFF.48385"; transcript_id "CUFF.48385.1"; exon_number "1"; FPKM "18.9452034252"; frac "1.000000"; conf_lo "7.520816"; conf_hi "30.369591"; cov "1.076087"; seq "TAGGTGTAATAGTGGAAAACAATAGTTTTTAAACTTCAGAGTCCAGGGCTGTAACTCAGTAGTAACAGTGTTCTCTAAGTATGTTATTCTTCCTCTACATGCTGAAATTTTTCATATTTGGAGCATTCACTGTTCCATGTATCAGTAAATTATATTGTGAGCTGTCATCATATCTAAGCACCATATTGAATATTTTTCATGATTAAAATTTGTTGAAACAACAATTCTATGACCGAAAAAAGCAAGGCTTTGTAAATAACATGTTTGTTACTAGTA";
+chr12 Cufflinks exon 30701762 30702509 1000 . . gene_id "CUFF.53897"; transcript_id "CUFF.53897.1"; exon_number "1"; FPKM "48.9333329111"; frac "1.000000"; conf_lo "37.780391"; conf_hi "60.086275"; cov "2.779412"; seq "TGTGGAGTGTACTTATATGATCCCTATGCTGATAGGATTACCTTCCTAGACATAGCTAGACGCAAAGCCACATGTGTAAGGCTGCTGAGCAAAGACAGCATCCCAGCATGGGTGTGTTCACGGTGGATTCACCACGTTGCATATGTAAAGTGGTCCCCTTGGCTTACCCTTCACTTTGCTCATGAGATTCAGAAGCTGGTGGTCCAGCAGGGGTGAGCATTTGTGAAATAGTAAGCTGAACTTAGTGGTGAGATTTCAGAACAGACTTCTGTGAAGTAAGAGATGTAACCATGCATCTAAAATCAGATGGCCGTGTAACTGCTCGGGCATAGAAATGGTGGGAGAACCTGTCCTGGGTACCTGGCATTTCACATGAGCCCAGGGATATGTCTTGTGCCAAGGCACACAAGTGTCCATGGACTTGGACAGGTGCCAAGGGTTTTTGTCTCTGTTCCTATGTGGGAGGCTGGCTGTGATTTACATTAATTTCTGTATTTCAAACGAAGATGTCTGCAGATCTCCATTTTGATGTTACAGCCTCATTGCCCAGGCAGTGGGCAGTGCCCAGACACCCTTTCTGACTAGCCACTGCATTGGGCTTCTGTGATTCAAAGTAGTGTATATATTTATTTACTTCTCTGACTGTGGCCAACAGCCAAATGCCATTTTATGTTCCTTGTATTCAGTCCATTACCAAAGAGGTGTTTGCACTTTGTAATGATACCTTTCAGTTCAAATAAAAGGACCA";
+chr13 Cufflinks exon 49159496 49159569 1000 . . gene_id "CUFF.67788"; transcript_id "CUFF.67788.1"; exon_number "1"; FPKM "44.9657653777"; frac "1.000000"; conf_lo "10.974842"; conf_hi "78.956689"; cov "2.554054"; seq "ttttcttttggattacttgatttttttttatttgatcttatttatgatgattttgagtacatttttgaacagtt";
+chr13 Cufflinks transcript 100200304 100200330 1000 . . gene_id "CUFF.73108"; transcript_id "CUFF.73108.1"; FPKM "123.2395051093"; frac "1.000000"; conf_lo "30.079196"; conf_hi "216.399814"; cov "7.000000"; seq "TCTCATATGAATAGCCACCCTCTTCTG";
+chr14 Cufflinks transcript 31949103 31949152 1000 . . gene_id "CUFF.77316"; transcript_id "CUFF.77316.1"; FPKM "85.5634278330"; frac "1.000000"; conf_lo "28.521143"; conf_hi "142.605713"; cov "4.860000"; seq "GGATGCTATCCGCGATGTGCATGTAAAGGGCCTCATGTACCAGTGGATCG";
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out5.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out5.fasta Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,31 @@
+>mm9_chr10_62044837_62045189_+
+AATTACAAGATCGACACACCAAGATAGGCAGATCCATGGTTGGTTTTACT
+TTGTAAATCTAAAAGTATGTTGGAAAACGATGCAATGAATTCTTATCCTT
+TTTCAAAATGAAGAATTTGTGATGGTTAGTGGACAGTTCAGAAGCCTCTC
+TGCAAGAAAGGGGGCGCTGAGAAGTGGTAAAAAAAGGAAGGAAGCACTCG
+GGCTTTGTCAGCAGGGTGGACCCTGGGGTCCACAGTGGGAACAGTCCCTT
+CTGGCCTCTACTCACTGACCAAACGCTTTACTAAAACTCCGCTTCTGGCC
+TCTGTTGCCACCTCCTGGTCGCTGTCCTCGGAAGTTTCTACTTCCTCCTC
+GCT
+>mm9_chr10_75372919_75373002_+
+GCGTCTCGCAGCTTCTGCCCGTCGATCTCCATGTCGAGCCGGATGGGCAC
+CAGCACCTCAGGCTGTGACGCATTCTCATGGATC
+>mm9_chr10_80362428_80363292_-
+ATGACGGACAAGTGTTTCCGGAAGTGCATCGGGAAGCCCGGGGGCTCCTT
+GGATAACTCGGAGCAGGTGAGACATCTCGGGAACCCGGGGTGGTGAGGGG
+CGCGGGGTCAGGAGCGTCTAGGAGGTTGAGAGATGTGCGCGTGCGCGGCC
+TCTAGCCTTAGCTACTGAGGAAGTTGTGCGCGTGCGCGGGGTGAGGACCC
+GGCTTCTGTGCCTAGATCGGTGCAGCCTTCATGGGTGATCCTCGGGTCGT
+GTGACCGTCAGTCAGGGATCCCCCTCCACGCTTTGCAGAAATGCATCGCC
+ATGTGCATGGACCGCTACATGGACGCCTGGAATACCGTGTCCCGCGCCTA
+CAACTCTCGACTGCAGCGGGAACGAGCCAACATGTGACCGGGACCTGTGC
+CTCGGGACACCGTGCTTATGGTCTGAACTGTTTTCCCTGCCAGTTAGGGT
+GTCTCCTCCTAGCCGCCCTGAAGTCTGGCAGCATGGAGGGCTTGGGGATC
+GAGGCCTCTCCCCTGGGTTGCTGCGTCCAGCTCAATCTCAGAAGAGAGTG
+AGGACCCGACAGAGCACAGGGATCTGGCTGGCCCCACTGACCTGTGACCT
+CAGGAGAGCAGGCCAATAAATCGCTGCTGGGGCAGTAAAGCAGGCGTGTC
+ACCTCACTGCTTCAGGTCCCTTCCCCTGAGTAGGCCCAGACCTCCCAGGG
+TATCTTTCCCCTTGGGGTCAGTGGGCTGCTGGCTCTCAGGGAATTCGGAG
+CATGATCTCAGGTGTTTGGTCATCCCGGGGAGACCAGCCGAGGTTAAGAA
+GCAAGGCTTCATGTagccttcacctatcatgcatgaggcccagggtgctg
+accttaactctgaat
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out6.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out6.fasta Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,20 @@
+>mm9_test_chromosome_53_550_+
+TACTATCTGACTAGACTGGAGGCGCTTGCGACTGAGCTAGGACGTGCCAC
+TACGGGGATGACGACTAGGACTACGGACGGACTTAGAGCGTCAGATGCAG
+CGACTGGACTATTTAGGACGATCGGACTGAGGAGGGCAGTAGGACGCTAC
+GTATTTGGCGCGCGGCGCTACGGCTGAGCGTCGAGCTTGCGATACGCCGT
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAC
+TATTACTTTATTATCTTACTCGGACGTAGACGGATCGGCAACGGGACTGT
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGTT
+TTCTACTTGAGACTGGGATCGAGGCGGACTTTTTAGGACGGGACTTGC
+>mm9_test_chromosome_53_250_+
+TACTATCTGACTAGACTGGAGGCGCTTGCGACTGAGCTAGGACGTGCCAC
+TACGGGGATGACGACTAGGACTACGGACGGACTTAGAGCGTCAGATGCAG
+CGACTGGACTATTTAGGACGATCGGACTGAGGAGGGCAGTAGGACGCTAC
+GTATTTGGCGCGCGGCGCTACGGCTGAGCGTCGAGCTTGCGATACGCC
+>mm9_test_chromosome_351_400_+
+ACTATTACTTTATTATCTTACTCGGACGTAGACGGATCGGCAACGGGACT
+>mm9_test_chromosome_501_550_+
+TTTTCTACTTGAGACTGGGATCGAGGCGGACTTTTTAGGACGGGACTTGC
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/extract_genomic_dna_out7.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extract_genomic_dna_out7.fasta Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,17 @@
+>mm9_test_chromosome_53_550_+
+TACTATCTGACTAGACTGGAGGCGCTTGCGACTGAGCTAGGACGTGCCAC
+TACGGGGATGACGACTAGGACTACGGACGGACTTAGAGCGTCAGATGCAG
+CGACTGGACTATTTAGGACGATCGGACTGAGGAGGGCAGTAGGACGCTAC
+GTATTTGGCGCGCGGCGCTACGGCTGAGCGTCGAGCTTGCGATACGCCGT
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAC
+TATTACTTTATTATCTTACTCGGACGTAGACGGATCGGCAACGGGACTGT
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGTT
+TTCTACTTGAGACTGGGATCGAGGCGGACTTTTTAGGACGGGACTTGCTA
+CTATCTGACTAGACTGGAGGCGCTTGCGACTGAGCTAGGACGTGCCACTA
+CGGGGATGACGACTAGGACTACGGACGGACTTAGAGCGTCAGATGCAGCG
+ACTGGACTATTTAGGACGATCGGACTGAGGAGGGCAGTAGGACGCTACGT
+ATTTGGCGCGCGGCGCTACGGCTGAGCGTCGAGCTTGCGATACGCCACTA
+TTACTTTATTATCTTACTCGGACGTAGACGGATCGGCAACGGGACTTTTT
+CTACTTGAGACTGGGATCGAGGCGGACTTTTTAGGACGGGACTTGC
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/gff_filter_by_attribute_out1.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gff_filter_by_attribute_out1.gff Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,20 @@
+chr10 Cufflinks transcript 62044837 62045189 1000 . . gene_id "CUFF.23531"; transcript_id "CUFF.23531.1"; FPKM "19.5178121606"; frac "1.000000"; conf_lo "9.264456"; conf_hi "29.771168"; cov "1.108611";
+chr10 Cufflinks transcript 75372919 75373002 1000 . . gene_id "CUFF.24985"; transcript_id "CUFF.24985.1"; FPKM "124.4970510798"; frac "1.000000"; conf_lo "71.411330"; conf_hi "177.582772"; cov "7.071429";
+chr10 Cufflinks transcript 80362428 80363292 1000 - . gene_id "CUFF.26065"; transcript_id "CUFF.26065.1"; FPKM "43.6170921216"; frac "1.000000"; conf_lo "32.260169"; conf_hi "54.974016"; cov "2.477449";
+chr11 Cufflinks transcript 7904565 7904642 1000 . . gene_id "CUFF.33508"; transcript_id "CUFF.33508.1"; FPKM "61.6484988869"; frac "1.000000"; conf_lo "22.882428"; conf_hi "100.414569"; cov "3.501633";
+chr11 Cufflinks exon 78140156 78140259 1000 . . gene_id "CUFF.43148"; transcript_id "CUFF.43148.1"; exon_number "1"; FPKM "54.8483511750"; frac "1.000000"; conf_lo "23.181641"; conf_hi "86.515061"; cov "3.115385";
+chr11 Cufflinks exon 105616462 105616737 1000 . . gene_id "CUFF.48385"; transcript_id "CUFF.48385.1"; exon_number "1"; FPKM "18.9452034252"; frac "1.000000"; conf_lo "7.520816"; conf_hi "30.369591"; cov "1.076087";
+chr12 Cufflinks exon 30701762 30702509 1000 . . gene_id "CUFF.53897"; transcript_id "CUFF.53897.1"; exon_number "1"; FPKM "48.9333329111"; frac "1.000000"; conf_lo "37.780391"; conf_hi "60.086275"; cov "2.779412";
+chr13 Cufflinks exon 49159496 49159569 1000 . . gene_id "CUFF.67788"; transcript_id "CUFF.67788.1"; exon_number "1"; FPKM "44.9657653777"; frac "1.000000"; conf_lo "10.974842"; conf_hi "78.956689"; cov "2.554054";
+chr13 Cufflinks transcript 100200304 100200330 1000 . . gene_id "CUFF.73108"; transcript_id "CUFF.73108.1"; FPKM "123.2395051093"; frac "1.000000"; conf_lo "30.079196"; conf_hi "216.399814"; cov "7.000000";
+chr14 Cufflinks transcript 31949103 31949152 1000 . . gene_id "CUFF.77316"; transcript_id "CUFF.77316.1"; FPKM "85.5634278330"; frac "1.000000"; conf_lo "28.521143"; conf_hi "142.605713"; cov "4.860000";
+chr14 Cufflinks exon 67604227 67604668 1000 . . gene_id "CUFF.81446"; transcript_id "CUFF.81446.1"; exon_number "1"; FPKM "123.6776546104"; frac "1.000000"; conf_lo "100.611653"; conf_hi "146.743656"; cov "7.024887";
+chr14 Cufflinks exon 75165582 75165744 1000 . . gene_id "CUFF.82088"; transcript_id "CUFF.82088.1"; exon_number "1"; FPKM "20.4139057543"; frac "1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov "1.159509";
+chr16 Cufflinks transcript 57154027 57154067 1000 . . gene_id "CUFF.103364"; transcript_id "CUFF.103364.1"; FPKM "162.3154457537"; frac "1.000000"; conf_lo "75.554191"; conf_hi "249.076701"; cov "9.219512";
+chr16 Cufflinks exon 74862302 74862560 1000 . . gene_id "CUFF.105450"; transcript_id "CUFF.105450.1"; exon_number "1"; FPKM "11.0120241741"; frac "1.000000"; conf_lo "2.020744"; conf_hi "20.003304"; cov "0.625483";
+chr16 Cufflinks transcript 98168779 98168914 1000 . . gene_id "CUFF.107834"; transcript_id "CUFF.107834.1"; FPKM "24.4666664555"; frac "1.000000"; conf_lo "5.971605"; conf_hi "42.961728"; cov "1.389706";
+chr17 Cufflinks exon 8483212 8483268 1000 . . gene_id "CUFF.108498"; transcript_id "CUFF.108498.1"; exon_number "1"; FPKM "50.0370923000"; frac "1.000000"; conf_lo "9.181978"; conf_hi "90.892207"; cov "2.842105";
+chr17 Cufflinks exon 30355791 30355913 1000 . . gene_id "CUFF.111759"; transcript_id "CUFF.111759.1"; exon_number "1"; FPKM "19.3232673516"; frac "1.000000"; conf_lo "2.040012"; conf_hi "36.606523"; cov "1.097561";
+chr18 Cufflinks transcript 39571718 39571880 1000 . . gene_id "CUFF.123569"; transcript_id "CUFF.123569.1"; FPKM "20.4139057543"; frac "1.000000"; conf_lo "4.982443"; conf_hi "35.845368"; cov "1.159509";
+chr19 Cufflinks exon 17633088 17633203 1000 . . gene_id "CUFF.131333"; transcript_id "CUFF.131333.1"; exon_number "1"; FPKM "20.4893265884"; frac "1.000000"; conf_lo "2.163116"; conf_hi "38.815537"; cov "1.163793";
+chr19 Cufflinks transcript 41997624 41997859 1000 . . gene_id "CUFF.133569"; transcript_id "CUFF.133569.1"; FPKM "28.1988698132"; frac "1.000000"; conf_lo "13.125940"; conf_hi "43.271800"; cov "1.601695";
b
diff -r 000000000000 -r 8dd8e89c0603 test-data/tophat_in1.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tophat_in1.fasta Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,14 @@
+>test_chromosome
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+ACTACTATCTGACTAGACTGGAGGCGCTTGCGACTGAGCTAGGACGTGCC
+ACTACGGGGATGACGACTAGGACTACGGACGGACTTAGAGCGTCAGATGC
+AGCGACTGGACTATTTAGGACGATCGGACTGAGGAGGGCAGTAGGACGCT
+ACGTATTTGGCGCGCGGCGCTACGGCTGAGCGTCGAGCTTGCGATACGCC
+GTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG
+ACTATTACTTTATTATCTTACTCGGACGTAGACGGATCGGCAACGGGACT
+GTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG
+TTTTCTACTTGAGACTGGGATCGAGGCGGACTTTTTAGGACGGGACTTGC
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
b
diff -r 000000000000 -r 8dd8e89c0603 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,6 @@
+<tables>
+    <table name="twobit" comment_char="#">
+        <columns>dbkey, value</columns>
+        <file path="tool-data/twobit.loc" />
+    </table>
+</tables>
b
diff -r 000000000000 -r 8dd8e89c0603 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="bx-python" version="0.7.1">
+        <repository changeset_revision="7ce9cf37130f" name="package_bx_python_0_7" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="faToTwoBit" version="35x1">
+        <repository changeset_revision="46615329ea72" name="package_fatotwobit_35x1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
b
diff -r 000000000000 -r 8dd8e89c0603 twobit.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/twobit.loc.sample Tue Jan 19 09:34:23 2016 -0500
b
@@ -0,0 +1,26 @@
+#This is a sample file distributed with Galaxy that is used by some
+#tools.  The twobit.loc file has this format (columns are separated by
+#TAB characters):
+#
+#<Build>    <FullPathToFile>
+#
+#So, for example, if you had droPer1 twobit files stored in 
+#/depot/data2/galaxy/droPer1/, then the twobit.loc entry 
+#would look like this:
+#
+#droPer1    /depot/data2/galaxy/droPer1/droPer1.2bit
+#
+#and your /depot/data2/galaxy/droPer1/ directory would 
+#contain all of your twobit files (e.g.):
+#
+#-rw-rw-r--   1 nate   galaxy 48972650 2007-05-04 11:27 droPer1.2bit
+#...etc...
+#
+#Your twobit.loc file should contain one entry per line for each twobit
+#file you have stored.  For example:
+#
+#droPer1    /depot/data2/galaxy/droPer1/droPer1.2bit
+#apiMel2    /depot/data2/galaxy/apiMel2/apiMel2.2bit
+#droAna1    /depot/data2/galaxy/droAna1/droAna1.2bit
+#droAna2    /depot/data2/galaxy/droAna2/droAna2.2bit
+#...etc...