# HG changeset patch # User drosofff # Date 1495579042 14400 # Node ID 1435d142041bb6f8519793375f94d7a96cb745ff planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit d5ebd581fa3a22ca61ce07a31c01bb70610fbcf5 diff -r 000000000000 -r 1435d142041b RepEnrich.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RepEnrich.py Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,382 @@ +#!/usr/bin/env python +import argparse +import csv +import numpy +import os +import shlex +import shutil +import subprocess +import sys + +parser = argparse.ArgumentParser(description='Part II: Conducting the alignments to the psuedogenomes. Before doing this step you will require 1) a bamfile of the unique alignments with index 2) a fastq file of the reads mapping to more than one location. These files can be obtained using the following bowtie options [EXAMPLE: bowtie -S -m 1 --max multimap.fastq mm9 mate1_reads.fastq] Once you have the unique alignment bamfile and the reads mapping to more than one location in a fastq file you can run this step. EXAMPLE: python master_output.py /users/nneretti/data/annotation/hg19/hg19_repeatmasker.txt /users/nneretti/datasets/repeatmapping/POL3/Pol3_human/HeLa_InputChIPseq_Rep1 HeLa_InputChIPseq_Rep1 /users/nneretti/data/annotation/hg19/setup_folder HeLa_InputChIPseq_Rep1_multimap.fastq HeLa_InputChIPseq_Rep1.bam') +parser.add_argument('--version', action='version', version='%(prog)s 0.1') +parser.add_argument('annotation_file', action= 'store', metavar='annotation_file', help='List RepeatMasker.org annotation file for your organism. The file may be downloaded from the RepeatMasker.org website. Example: /data/annotation/hg19/hg19_repeatmasker.txt') +parser.add_argument('outputfolder', action= 'store', metavar='outputfolder', help='List folder to contain results. Example: /outputfolder') +parser.add_argument('outputprefix', action= 'store', metavar='outputprefix', help='Enter prefix name for data. 
Example: HeLa_InputChIPseq_Rep1') +parser.add_argument('setup_folder', action= 'store', metavar='setup_folder', help='List folder that contains the repeat element psuedogenomes. Example /data/annotation/hg19/setup_folder') +parser.add_argument('fastqfile', action= 'store', metavar='fastqfile', help='Enter file for the fastq reads that map to multiple locations. Example /data/multimap.fastq') +parser.add_argument('alignment_bam', action= 'store', metavar='alignment_bam', help='Enter bamfile output for reads that map uniquely. Example /bamfiles/old.bam') +parser.add_argument('--pairedend', action= 'store', dest='pairedend', default= 'FALSE', help='Designate this option for paired-end sequencing. Default FALSE change to TRUE') +parser.add_argument('--collapserepeat', action= 'store', dest='collapserepeat', metavar='collapserepeat', default= 'Simple_repeat', help='Designate this option to generate a collapsed repeat type. Uncollapsed output is generated in addition to collapsed repeat type. Simple_repeat is default to simplify downstream analysis. You can change the default to another repeat name to collapse a seperate specific repeat instead or if the name of Simple_repeat is different for your organism. Default Simple_repeat') +parser.add_argument('--fastqfile2', action= 'store', dest='fastqfile2', metavar='fastqfile2', default= 'none', help='Enter fastqfile2 when using paired-end option. Default none') +parser.add_argument('--cpus', action= 'store', dest='cpus', metavar='cpus', default= "1", type=int, help='Enter available cpus per node. The more cpus the faster RepEnrich performs. RepEnrich is designed to only work on one node. Default: "1"') +parser.add_argument('--allcountmethod', action= 'store', dest='allcountmethod', metavar='allcountmethod', default= "FALSE", help='By default the pipeline only outputs the fraction count method. Consdidered to be the best way to count multimapped reads. 
Changing this option will include the unique count method, a conservative count, and the total count method, a liberal counting strategy. Our evaluation of simulated data indicated fraction counting is best. Default = FALSE, change to TRUE') +parser.add_argument('--is_bed', action= 'store', dest='is_bed', metavar='is_bed', default= 'FALSE', help='Is the annotation file a bed file. This is also a compatible format. The file needs to be a tab seperated bed with optional fields. Ex. format chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element if not applicable. Default FALSE change to TRUE') +args = parser.parse_args() + +# parameters +annotation_file = args.annotation_file +outputfolder = args.outputfolder +outputfile_prefix = args.outputprefix +setup_folder = args.setup_folder +repeat_bed = setup_folder + os.path.sep + 'repnames.bed' +unique_mapper_bam = args.alignment_bam +fastqfile_1 = args.fastqfile +fastqfile_2 = args.fastqfile2 +cpus = args.cpus +b_opt = "-k1 -p " +str(1) +" --quiet" +simple_repeat = args.collapserepeat +paired_end = args.pairedend +allcountmethod = args.allcountmethod +is_bed = args.is_bed + +################################################################################ +# check that the programs we need are available +try: + subprocess.call(shlex.split("coverageBed -h"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) + subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) +except OSError: + print ("Error: Bowtie or BEDTools not loaded") + raise + +################################################################################ +# define a csv reader that reads space deliminated files +print ('Preparing for analysis using RepEnrich...') +csv.field_size_limit(sys.maxsize) +def import_text(filename, separator): + for line in csv.reader(open(filename), delimiter=separator, + skipinitialspace=True): + if line: + yield line + 
+################################################################################ +# build dictionaries to convert repclass and rep families' +if is_bed == "FALSE": + repeatclass = {} + repeatfamily = {} + fin = import_text(annotation_file, ' ') + x = 0 + for line in fin: + if x>2: + classfamily =[] + classfamily = line[10].split(os.path.sep) + line9 = line[9].replace("(","_").replace(")","_").replace("/","_") + repeatclass[line9] = classfamily[0] + if len(classfamily) == 2: + repeatfamily[line9] = classfamily[1] + else: + repeatfamily[line9] = classfamily[0] + x +=1 +if is_bed == "TRUE": + repeatclass = {} + repeatfamily = {} + fin = open(annotation_file, 'r') + for line in fin: + line=line.strip('\n') + line=line.split('\t') + theclass =line[4] + thefamily = line[5] + line3 = line[3].replace("(","_").replace(")","_").replace("/","_") + repeatclass[line3] = theclass + repeatfamily[line3] = thefamily +fin.close() + +################################################################################ +# build list of repeats initializing dictionaries for downstream analysis' +fin = import_text(setup_folder + os.path.sep + 'repgenomes_key.txt', '\t') +repeat_key ={} +rev_repeat_key ={} +repeat_list = [] +reptotalcounts = {} +classfractionalcounts = {} +familyfractionalcounts = {} +classtotalcounts = {} +familytotalcounts = {} +reptotalcounts_simple = {} +fractionalcounts = {} +i = 0 +for line in fin: + reptotalcounts[line[0]] = 0 + fractionalcounts[line[0]] = 0 + if line[0] in repeatclass: + classtotalcounts[repeatclass[line[0]]] = 0 + classfractionalcounts[repeatclass[line[0]]] = 0 + if line[0] in repeatfamily: + familytotalcounts[repeatfamily[line[0]]] = 0 + familyfractionalcounts[repeatfamily[line[0]]] = 0 + if line[0] in repeatfamily: + if repeatfamily[line[0]] == simple_repeat: + reptotalcounts_simple[simple_repeat] = 0 + else: + reptotalcounts_simple[line[0]] = 0 + repeat_list.append(line[0]) + repeat_key[line[0]] = int(line[1]) + rev_repeat_key[int(line[1])] = 
line[0] +fin.close() +################################################################################ +# map the repeats to the psuedogenomes: +if not os.path.exists(outputfolder): + os.mkdir(outputfolder) +################################################################################ +# Conduct the regions sorting +print ('Conducting region sorting on unique mapping reads....') +fileout= outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt' +with open(fileout, 'w') as stdout: + command = shlex.split("coverageBed -abam " +unique_mapper_bam+" -b " +setup_folder + os.path.sep + 'repnames.bed') + p = subprocess.Popen(command, stdout=stdout) +p.communicate() +stdout.close() +filein = open(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt','r') +counts = {} +sumofrepeatreads=0 +for line in filein: + line= line.split('\t') + if not str(repeat_key[line[3]]) in counts: + counts[str(repeat_key[line[3]])]=0 + counts[str(repeat_key[line[3]])]+=int(line[4]) + sumofrepeatreads+=int(line[4]) +print ('Identified ' + str(sumofrepeatreads) + 'unique reads that mapped to repeats.') +################################################################################ +if paired_end == 'TRUE': + if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') + if not os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'pair2_bowtie') + folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' + folder_pair2 = outputfolder + os.path.sep + 'pair2_bowtie' +################################################################################ + print ("Processing repeat psuedogenomes...") + ps = [] + psb= [] + ticker= 0 + for metagenome in repeat_list: + metagenomepath = setup_folder + os.path.sep + metagenome + file1=folder_pair1 + os.path.sep + metagenome + '.bowtie' + file2 =folder_pair2 + os.path.sep + metagenome + '.bowtie' + with open(file1, 'w') as 
stdout: + command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_1) + p = subprocess.Popen(command,stdout=stdout) + with open(file2, 'w') as stdout: + command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_2) + pp = subprocess.Popen(command,stdout=stdout) + ps.append(p) + ticker +=1 + psb.append(pp) + ticker +=1 + if ticker == cpus: + for p in ps: + p.communicate() + for p in psb: + p.communicate() + ticker = 0 + psb =[] + ps = [] + if len(ps) > 0: + for p in ps: + p.communicate() + stdout.close() + +################################################################################ +# combine the output from both read pairs: + print ('sorting and combining the output for both read pairs...') + if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') + sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' + for metagenome in repeat_list: + file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' + file2 = folder_pair2 + os.path.sep + metagenome + '.bowtie' + fileout= sorted_bowtie + os.path.sep + metagenome + '.bowtie' + with open(fileout, 'w') as stdout: + p1 = subprocess.Popen(['cat',file1,file2], stdout = subprocess.PIPE) + p2 = subprocess.Popen(['cut', '-f1',"-d "], stdin = p1.stdout, stdout = subprocess.PIPE) + p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin = p2.stdout, stdout = subprocess.PIPE) + p4 = subprocess.Popen(['sort'], stdin=p3.stdout, stdout = subprocess.PIPE) + p5 = subprocess.Popen(['uniq'], stdin=p4.stdout, stdout = stdout) + p5.communicate() + stdout.close() + print ('completed ...') +################################################################################ +if paired_end == 'FALSE': + if not os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'pair1_bowtie') + folder_pair1 = outputfolder + os.path.sep + 'pair1_bowtie' 
+################################################################################ + ps = [] + ticker= 0 + print ("Processing repeat psuedogenomes...") + for metagenome in repeat_list: + metagenomepath = setup_folder + os.path.sep + metagenome + file1=folder_pair1 + os.path.sep + metagenome + '.bowtie' + with open(file1, 'w') as stdout: + command = shlex.split("bowtie " + b_opt + " " + metagenomepath + " " + fastqfile_1) + p = subprocess.Popen(command,stdout=stdout) + ps.append(p) + ticker +=1 + if ticker == cpus: + for p in ps: + p.communicate() + ticker = 0 + ps = [] + if len(ps) > 0: + for p in ps: + p.communicate() + stdout.close() + +################################################################################ +# combine the output from both read pairs: + print ('Sorting and combining the output for both read pairs....') + if not os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): + os.mkdir(outputfolder + os.path.sep + 'sorted_bowtie') + sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' + for metagenome in repeat_list: + file1 = folder_pair1 + os.path.sep + metagenome + '.bowtie' + fileout= sorted_bowtie + os.path.sep + metagenome + '.bowtie' + with open(fileout, 'w') as stdout: + p1 = subprocess.Popen(['cat',file1], stdout = subprocess.PIPE) + p2 = subprocess.Popen(['cut', '-f1'], stdin = p1.stdout, stdout = subprocess.PIPE) + p3 = subprocess.Popen(['cut', '-f1', "-d/"], stdin = p2.stdout, stdout = subprocess.PIPE) + p4 = subprocess.Popen(['sort'], stdin = p3.stdout,stdout = subprocess.PIPE) + p5 = subprocess.Popen(['uniq'], stdin = p4.stdout,stdout = stdout) + p5.communicate() + stdout.close() + print ('completed ...') + +################################################################################ +# build a file of repeat keys for all reads +print ('Writing and processing intermediate files...') +sorted_bowtie = outputfolder + os.path.sep + 'sorted_bowtie' +readid = {} +sumofrepeatreads=0 +for rep in repeat_list: + for data in 
import_text(sorted_bowtie + os.path.sep + rep + '.bowtie', '\t'): + readid[data[0]] = '' +for rep in repeat_list: + for data in import_text(sorted_bowtie + os.path.sep + rep + '.bowtie', '\t'): + readid[data[0]]+=str(repeat_key[rep]) + str(',') +for subfamilies in readid.values(): + if not subfamilies in counts: + counts[subfamilies]=0 + counts[subfamilies] +=1 + sumofrepeatreads+=1 +del readid +print ('Identified ' + str(sumofrepeatreads) + ' reads that mapped to repeats for unique and multimappers.') + +################################################################################ +print ("Conducting final calculations...") +# build a converter to numeric label for repeat and yield a combined list of repnames seperated by backslash +def convert(x): + x = x.strip(',') + x = x.split(',') + global repname + repname = "" + for i in x: + repname = repname + os.path.sep + rev_repeat_key[int(i)] +# building the total counts for repeat element enrichment... +for x in counts.keys(): + count= counts[x] + x = x.strip(',') + x = x.split(',') + for i in x: + reptotalcounts[rev_repeat_key[int(i)]] += int(count) +# building the fractional counts for repeat element enrichment... +for x in counts.keys(): + count= counts[x] + x = x.strip(',') + x = x.split(',') + splits = len(x) + for i in x: + fractionalcounts[rev_repeat_key[int(i)]] += float(numpy.divide(float(count),float(splits))) +# building categorized table of repeat element enrichment... +repcounts = {} +repcounts['other'] = 0 +for key in counts.keys(): + convert(key) + repcounts[repname] = counts[key] +# building the total counts for class enrichment... +for key in reptotalcounts.keys(): + classtotalcounts[repeatclass[key]] += reptotalcounts[key] +# building total counts for family enrichment... 
+for key in reptotalcounts.keys(): + familytotalcounts[repeatfamily[key]] += reptotalcounts[key] +# building unique counts table' +repcounts2 = {} +for rep in repeat_list: + if "/" +rep in repcounts: + repcounts2[rep] = repcounts["/" +rep] + else: + repcounts2[rep] = 0 +# building the fractionalcounts counts for class enrichment... +for key in fractionalcounts.keys(): + classfractionalcounts[repeatclass[key]] += fractionalcounts[key] +# building fractional counts for family enrichment... +for key in fractionalcounts.keys(): + familyfractionalcounts[repeatfamily[key]] += fractionalcounts[key] + +################################################################################ +print ('Writing final output and removing intermediate files...') +# print output to file of the categorized counts and total overlapping counts: +if allcountmethod == "TRUE": + fout1 = open(outputfolder + os.path.sep + outputfile_prefix + '_total_counts.txt' , 'w') + for key in reptotalcounts.keys(): + fout1.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(reptotalcounts[key]) + '\n') + fout2 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_total_counts.txt' , 'w') + for key in classtotalcounts.keys(): + fout2.write(str(key) + '\t' + str(classtotalcounts[key]) + '\n') + fout3 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_total_counts.txt' , 'w') + for key in familytotalcounts.keys(): + fout3.write(str(key) + '\t' + str(familytotalcounts[key]) + '\n') + fout4 = open(outputfolder + os.path.sep + outputfile_prefix + '_unique_counts.txt' , 'w') + for key in repcounts2.keys(): + fout4.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(repcounts2[key]) + '\n') + fout5 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_fraction_counts.txt' , 'w') + for key in classfractionalcounts.keys(): + fout5.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') + fout6 = open(outputfolder + 
os.path.sep + outputfile_prefix + '_family_fraction_counts.txt' , 'w') + for key in familyfractionalcounts.keys(): + fout6.write(str(key) + '\t' + str(familyfractionalcounts[key]) + '\n') + fout7 = open(outputfolder + os.path.sep + outputfile_prefix + '_fraction_counts.txt' , 'w') + for key in fractionalcounts.keys(): + fout7.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) + '\n') + fout1.close() + fout2.close() + fout3.close() + fout4.close() + fout5.close() + fout6.close() + fout7.close() +else: + fout1 = open(outputfolder + os.path.sep + outputfile_prefix + '_class_fraction_counts.txt' , 'w') + for key in classfractionalcounts.keys(): + fout1.write(str(key) + '\t' + str(classfractionalcounts[key]) + '\n') + fout2 = open(outputfolder + os.path.sep + outputfile_prefix + '_family_fraction_counts.txt' , 'w') + for key in familyfractionalcounts.keys(): + fout2.write(str(key) + '\t' + str(familyfractionalcounts[key])+ '\n') + fout3 = open(outputfolder + os.path.sep + outputfile_prefix + '_fraction_counts.txt' , 'w') + for key in fractionalcounts.keys(): + fout3.write(str(key) + '\t' + repeatclass[key] + '\t' + repeatfamily[key] + '\t' + str(int(fractionalcounts[key])) + '\n') + fout1.close() + fout2.close() + fout3.close() + +################################################################################ +# Remove Large intermediate files +if os.path.exists(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt'): + os.remove(outputfolder + os.path.sep + outputfile_prefix + '_regionsorter.txt') +if os.path.exists(outputfolder + os.path.sep + 'pair1_bowtie'): + shutil.rmtree(outputfolder + os.path.sep + 'pair1_bowtie') +if os.path.exists(outputfolder + os.path.sep + 'pair2_bowtie'): + shutil.rmtree(outputfolder + os.path.sep + 'pair2_bowtie') +if os.path.exists(outputfolder + os.path.sep + 'sorted_bowtie'): + shutil.rmtree(outputfolder + os.path.sep + 'sorted_bowtie') + +print ("... 
Done") diff -r 000000000000 -r 1435d142041b RepEnrich_setup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RepEnrich_setup.py Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,194 @@ +#!/usr/bin/env python +import argparse +import csv +import os +import shlex +import subprocess +import sys +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC + +parser = argparse.ArgumentParser(description='Part I: Prepartion of repetive element psuedogenomes and repetive element bamfiles. This script prepares the annotation used by downstream applications to analyze for repetitive element enrichment. For this script to run properly bowtie must be loaded. The repeat element psuedogenomes are prepared in order to analyze reads that map to multiple locations of the genome. The repeat element bamfiles are prepared in order to use a region sorter to analyze reads that map to a single location of the genome.You will 1) annotation_file: The repetitive element annotation file downloaded from RepeatMasker.org database for your organism of interest. 2) genomefasta: Your genome of interest in fasta format, 3)setup_folder: a folder to contain repeat element setup files command-line usage EXAMPLE: python master_setup.py /users/nneretti/data/annotation/mm9/mm9_repeatmasker.txt /users/nneretti/data/annotation/mm9/mm9.fa /users/nneretti/data/annotation/mm9/setup_folder', prog='getargs_genome_maker.py') +parser.add_argument('--version', action='version', version='%(prog)s 0.1') +parser.add_argument('annotation_file', action= 'store', metavar='annotation_file', help='List annotation file. The annotation file contains the repeat masker annotation for the genome of interest and may be downloaded at RepeatMasker.org Example /data/annotation/mm9/mm9.fa.out') +parser.add_argument('genomefasta', action= 'store', metavar='genomefasta', help='File name and path for genome of interest in fasta format. 
Example /data/annotation/mm9/mm9.fa') +parser.add_argument('setup_folder', action= 'store', metavar='setup_folder', help='List folder to contain bamfiles for repeats and repeat element psuedogenomes. Example /data/annotation/mm9/setup') +parser.add_argument('--nfragmentsfile1', action= 'store', dest='nfragmentsfile1', metavar='nfragmentsfile1', default='./repnames_nfragments.txt', help='Output location of a description file that saves the number of fragments processed per repname. Default ./repnames_nfragments.txt') +parser.add_argument('--gaplength', action= 'store', dest='gaplength', metavar='gaplength', default= '200', type=int, help='Length of the spacer used to build repeat psuedogeneomes. Default 200') +parser.add_argument('--flankinglength', action= 'store', dest='flankinglength', metavar='flankinglength', default= '25', type=int, help='Length of the flanking region adjacent to the repeat element that is used to build repeat psuedogeneomes. The flanking length should be set according to the length of your reads. Default 25') +parser.add_argument('--is_bed', action= 'store', dest='is_bed', metavar='is_bed', default= 'FALSE', help='Is the annotation file a bed file. This is also a compatible format. The file needs to be a tab seperated bed with optional fields. Ex. format chr\tstart\tend\tName_element\tclass\tfamily. The class and family should identical to name_element if not applicable. 
Default FALSE change to TRUE') +args = parser.parse_args() + +# parameters and paths specified in args_parse +gapl = args.gaplength +flankingl = args.flankinglength +annotation_file = args.annotation_file +genomefasta = args.genomefasta +setup_folder = args.setup_folder +nfragmentsfile1 = args.nfragmentsfile1 +is_bed = args.is_bed + +################################################################################ +# check that the programs we need are available +try: + subprocess.call(shlex.split("bowtie --version"), stdout=open(os.devnull, 'wb'), stderr=open(os.devnull, 'wb')) +except OSError: + print ("Error: Bowtie or BEDTools not loaded") + raise + +################################################################################ +# Define a text importer +csv.field_size_limit(sys.maxsize) +def import_text(filename, separator): + for line in csv.reader(open(os.path.realpath(filename)), delimiter=separator, + skipinitialspace=True): + if line: + yield line +# Make a setup folder +if not os.path.exists(setup_folder): + os.makedirs(setup_folder) + +################################################################################ +# load genome into dictionary +print ("loading genome...") +g = SeqIO.to_dict(SeqIO.parse(genomefasta, "fasta")) + +print ("Precomputing length of all chromosomes...") +idxgenome = {} +lgenome = {} +genome = {} +allchrs = g.keys() +k = 0 +for chr in allchrs: + genome[chr] = str(g[chr].seq) +# del g[chr] + lgenome[chr] = len(genome[chr]) + idxgenome[chr] = k + k = k + 1 +del g + +################################################################################ +# Build a bedfile of repeatcoordinates to use by RepEnrich region_sorter +if is_bed == "FALSE": + repeat_elements= [] + fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') + fin = import_text(annotation_file, ' ') + x = 0 + rep_chr = {} + rep_start = {} + rep_end = {} + x = 0 + for line in fin: + if x>2: + line9 = 
line[9].replace("(","_").replace(")","_").replace("/","_") + repname = line9 + if not repname in repeat_elements: + repeat_elements.append(repname) + repchr = line[4] + repstart = int(line[5]) + repend = int(line[6]) +# print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname)+ '\n') +# if rep_chr.has_key(repname): + if repname in rep_chr: + rep_chr[repname].append(repchr) + rep_start[repname].append(int(repstart)) + rep_end[repname].append(int(repend)) + else: + rep_chr[repname] = [repchr] + rep_start[repname] = [int(repstart)] + rep_end[repname] = [int(repend)] + x +=1 +if is_bed == "TRUE": + repeat_elements= [] + fout = open(os.path.realpath(setup_folder + os.path.sep + 'repnames.bed'), 'w') + fin = open(os.path.realpath(annotation_file), 'r') + x =0 + rep_chr = {} + rep_start = {} + rep_end = {} + x =0 + for line in fin: + line=line.strip('\n') + line=line.split('\t') + line3 = line[3].replace("(","_").replace(")","_").replace("/","_") + repname = line3 + if not repname in repeat_elements: + repeat_elements.append(repname) + repchr = line[0] + repstart = int(line[1]) + repend = int(line[2]) +# print >> fout, str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + fout.write(str(repchr) + '\t'+str(repstart)+ '\t'+str(repend)+ '\t'+str(repname) + '\n') +# if rep_chr.has_key(repname): + if repname in rep_chr: + rep_chr[repname].append(repchr) + rep_start[repname].append(int(repstart)) + rep_end[repname].append(int(repend)) + else: + rep_chr[repname] = [repchr] + rep_start[repname] = [int(repstart)] + rep_end[repname] = [int(repend)] + +fin.close() +fout.close() +repeat_elements = sorted(repeat_elements) +print ("Writing a key for all repeats...") +#print to fout the binary key that contains each repeat type with the associated binary number; sort the binary key: +fout = open(os.path.realpath(setup_folder + os.path.sep + 
'repgenomes_key.txt'), 'w') +x = 0 +for repeat in repeat_elements: +# print >> fout, str(repeat) + '\t' + str(x) + fout.write(str(repeat) + '\t' + str(x) + '\n') + x +=1 +fout.close() +################################################################################ +# generate spacer for psuedogenomes +spacer = "" +for i in range(gapl): + spacer = spacer + "N" + +# save file with number of fragments processed per repname +print ("Saving number of fragments processed per repname to " + nfragmentsfile1) +fout1 = open(os.path.realpath(nfragmentsfile1),"w") +for repname in rep_chr.keys(): + rep_chr_current = rep_chr[repname] +# print >>fout1, str(len(rep_chr[repname])) + "\t" + repname + fout1.write(str(len(rep_chr[repname])) + "\t" + repname + '\n') +fout1.close() + +# generate metagenomes and save them to FASTA files +k = 1 +nrepgenomes = len(rep_chr.keys()) +for repname in rep_chr.keys(): + metagenome = "" + newname = repname.replace("(","_").replace(")","_").replace("/","_") + print ("processing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") + rep_chr_current = rep_chr[repname] + rep_start_current = rep_start[repname] + rep_end_current = rep_end[repname] + print ("-------> " + str(len(rep_chr[repname])) + " fragments") + for i in range(len(rep_chr[repname])): + try: + chr = rep_chr_current[i] + rstart = max(rep_start_current[i] - flankingl, 0) + rend = min(rep_end_current[i] + flankingl, lgenome[chr]-1) + metagenome = metagenome + spacer + genome[chr][rstart:(rend+1)] + except KeyError: + print ("Unrecognised Chromosome: "+chr) + pass + + # Convert metagenome to SeqRecord object (required by SeqIO.write) + record = SeqRecord(Seq(metagenome, IUPAC.unambiguous_dna), id = "repname", name = "", description = "") + print ("saving repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") + fastafilename = os.path.realpath(setup_folder + os.path.sep + newname + ".fa") + SeqIO.write(record, fastafilename, "fasta") + 
print ("indexing repgenome " + newname + ".fa" + " (" + str(k) + " of " + str(nrepgenomes) + ")") + command = shlex.split('bowtie-build -f ' + fastafilename + ' ' + setup_folder + os.path.sep + newname) + p = subprocess.Popen(command).communicate() + k += 1 + +print ("... Done") + diff -r 000000000000 -r 1435d142041b repenrich.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repenrich.xml Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,159 @@ + + Repeat Element Profiling + + bowtie + samtools + bedtools + biopython + + + + + ${input_base}_unique.bam && + samtools sort ${input_base}_unique.bam ${input_base}_unique_sorted && + mv ${input_base}_unique_sorted.bam ${input_base}_unique.bam && + samtools index ${input_base}_unique.bam && + rm ${input_base}_unique.sam && + python $__tool_directory__/RepEnrich.py $repeatmasker ${input_base} ${input_base} setup_folder_${baseReference} ${input_base}_multimap.fastq ${input_base}_unique.bam --cpus "\${GALAXY_SLOTS:-4}" && + cp $input_base/${input_base}_class_fraction_counts.txt class_fraction_counts.tabular && + cp $input_base/${input_base}_family_fraction_counts.txt family_fraction_counts.tabular && + cp $input_base/${input_base}_fraction_counts.txt fraction_counts.tabular + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +Reads are mapped to the genome using the Bowtie1 aligner. Reads mapping uniquely to the genome are assigned to subfamilies of repetitive elements based on their degree of overlap to RepeatMasker annotated genomic instances of each repetitive element subfamily. Reads mapping to multiple locations are separately mapped to repetitive element assemblies – referred to as repetitive element psuedogenomes – built from RepeatMasker annotated genomic instances of repetitive element subfamilies. RepEnrich then return tables of counts merged from both strategies, that can be further processed in statistical analysis for differential expression. 
For detailed information see the `original publication`_. + +.. _original publication: https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-583 + +**Inputs** + +*Reference genome* : reference genome in fasta format + +*Sequencing dataset*: Single-read sequencing dataset. Paired-end sequencing dataset is not implemented yet + +*RepeatMasker description file*: a txt repeatmasker file which can be downloaded from http://www.repeatmasker.org/genomicDatasets/RMGenomicDatasets.html + +This file looks like: + + + +Users may filter this file so that it contains only desired items (for instance only satellites, repeats and transposons) + +**Outputs** + +(1) Fraction counts, (2) Family fraction counts and (3) Class fraction counts are returned in tabular format, for further statistical tests, differential expression analysis or graphics + +**RepEnrich** + +This Galaxy tool is a wrapper of the RepEnrich tool by steven_criscione@brown.edu et al. whose code and manual are available in `GitHub`_. + +.. _GitHub: https://github.com/nskvir/RepEnrich + +Python scripts RepEnrich.py and RepEnrich_setup.py have been adapted to Python 3. Note that the sorting of Fraction counts, Family fraction counts and Class fraction counts differs between this Galaxy wrapper and RepEnrich as found in the `RepEnrich code repository`_. However, this different sorting does not affect subsequent statistical analyses + +.. _RepEnrich code repository: https://github.com/nskvir/RepEnrich + +**Execution time** + +.. class:: warningmark + +This tool includes steps to index the reference genome, index repeat sequences and align reads to these indexes. Therefore the run time may be **long to very long**. + +.. class:: infomark + +For more information on the tools, please visit our `code repository`_. 
+ +If you would like to give us feedback or you run into any trouble, please send an email to artbio.ibps@gmail.com + +This tool wrapper is developed by the `ARTbio team`_ at the `Institut de Biologie Paris Seine (IBPS)`_. + +.. _code repository: https://github.com/ARTbio/tools-artbio/tree/master/tools/ +.. _ARTbio team: http://artbio.fr +.. _Institut de Biologie Paris Seine (IBPS): http://www.ibps.upmc.fr/en/core-facilities/bioinformatics + + + + + 10.1186/1471-2164-15-583 + + diff -r 000000000000 -r 1435d142041b test-data/Samp.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Samp.fastq Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,10040 @@ +@HISEQ:262:CA81LANXX:2:1101:1338:1996 +GCGGGTGATAAACTTCTGTGAAAAAAAGCTCAAAAAAATCTCACAAAAAATAAAACTTCTGATAAAATAAATAAAATTATTCCTCATCGTAAACCAATAGTTACTGCATAAGTATGTAATCCTTG ++ +BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFF +@HISEQ:262:CA81LANXX:2:1101:7367:1984 +GGGTCTTCTCGTCTTTTAAATAAATTTTAGCTTTTTGACTAAAAAATAAAATTCTATAAAAATTTTAAATGAAACAGTTAATATTTCGTCCAACCATTCATTCCAGCCTTCAATTAAAAGACTAA ++ +BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFchrM +AATGAATTGCCTGATAAAAAGGATTACCTTGATAGGGTAAATCATGCAGT +TTTCTGCATTCATTGACTGATTTATATATTATTTATAAAGATGATTTTAT +ATTTAATAGAATTAAACTATTTCTAAAAGTATCAAAAACTTTTGTGCATC +ATACACCAAAATATATTTACAAAAAGATAAGCTAATTAAGCTACTGGGTT +CATACCCCATTTATAAAGGTTATAATCCTTTTCTTTTTAATTTTTAATAA +TTCGTCAAAAATTTTATTTATTACAATTATAATTATTGGGACATTAATTA +CAGTTACATCTAATTCTTGGTTAGGAGCTTGAATAGGTTTAGAAATTAAT +TTATTATCTTTTATCCCCCTATTAAGAGATAATAATAATTTAATATCTAC +AGAAGCTTCTTTAAAATATTTTTTAACCCAAGTTTTAGCTTCAACTGTTT +TATTATTTTCTTCAATTTTATTAATATTAAAAAATAATATAAATAATGAA +ATTAATGAATCTTTTACATCCATAATTATTATATCAGCTTTATTATTAAA +AAGTGGAGCCGCTCCTTTCCATTTTTGATTTCCTAATATAATAGAAGGTT +TAACATGAATAAATGCTTTAATATTAATAACTTGACAAAAAATTGCACCT +TTAATATTAATTTCTTATCTTAATATTAAATATTTATTATTAATTAGAGT +AATTTTATCAGTTATTATTGGAGCTATTGGAGGATTAAATCAAACTTCTT +TACGAAAATTAATAGCATTTTCTTCAATTAATCATTTAGGGTGAATATTA 
+AGATCTTTAATAATTAGAGAATCAATTTGATTAATTTATTTTTTTTTTTA +TTCATTTTTATCATTTGTATTAACATTTATATTTAATATTTTTAAATTAT +TTCATTTAAATCAATTATTTTCTTGATTTGTTAATAGAAAAATTTTGAAA +TTTACATTATTTATAAATTTTTTATCATTAGGAGGATTACCTCCATTTTT +AGGATTTTTACCAAAATGACTTGTAATTCAACAATTAACATTATGTAATC +AATATTTTATATTAACACTTATAATAATATCAACTTTAATTACATTATTT +TTTTATTTACGAATTTGTTATTCCGCTTTTATAATAAATTATTTTGAAAA +TAACTGAATCATAAAGATAAATATAAATAGTATTAATTATAATATATATA +TAATTATAACttttttttcaatttttggattatttttaatttctttattt +tattttatattTTAAGGCTTTAAGTTAATAAAACTAATAACCTTCAAAGC +TATAAATAAAGAAATTTCTTTAAGCCTTAGTAAAACTTACTCCTTCAAAA +TTGCAGTTTGATATCATTATTGACTATAAGACCTAATTAATTTGTCCTTA +TTTGATTAAGAAGAATAAATCTTATATATAGATTTACAATCTATCGCCTA +AACTTCAGCCACTTAATCAATAATCGCGACAATGATTATTTTCTACAAAT +CATAAAGATATCGGAACTTTATATTTTATTTTTGGAGCTTGAGCTGGAAT +AGTTGGAACATCTTTAAGAATTTTAATTCGAGCTGAATTAGGACATCCTG +GAGCATTAATTGGAGATGATCAAATTTATAATGTAATTGTAACTGCACAT +GCTTTTATTATAATTTTTTTTATAGTTATACCTATTATAATTGGTGGATT +TGGAAATTGATTAGTGCCTTTAATATTAGGTGCTCCTGATATAGCATTCC +CACGAATAAATAATATAAGATTTTGACTTCTACCTCCTGCTCTTTCTTTA +CTATTAGTAAGTAGAATAGTTGAAAATGGAGCTGGGACAGGATGAACTGT +TTATCCACCTCTATCCGCTGGAATTGCTCATGGTGGAGCTTCAGTTGATT +TAGCTATTTTTTCTCTACATTTAGCAGGAATTTCTTCAATTTTAGGAGCT +GTAAATTTTATTACAACTGTAATTAATATACGATCAACAGGAATTTCATT +AGATCGTATACCTttatttgtttgatcagtagttattactgctttattat +tattattatCACTTCCAGTACTAGCAGGAGCTATTACTATATTATTAACA +GATCGAAATTTAAATACATCATTTTTTGACCCAGCGGGAGGAGGAGATCC +TATTTTATACCAACATTTATTTTGATTTTTTGGTCATCCTGAAGTTTATA +TTTTAATTTTACCTGGATTTGGAATAATTTCTCATATTATTAGACAAGAA +TCAGGAAAAAAGGAAACTTTTGGTTCTCTAGGAATAATTTATGCTATATT +AGCTATTGGATTATTAGGATTTATTGTATGAGCTCATCATATATTTACCG +TTGGAATAGATGTAGATACTCGAGCTTATTTTACCTCAGCTACTATAATT +ATTGCAGTTCCTACTGGAATTAAAATTTTTAGTTGATTAGCTACTTTACA +TGGAACTCAACTTTCTTATTCTCCAGCTATTTTATGAGCTTTAGGATTTG +TTTTTTTATTTACAGTAGGAGGATTAACAGGAGTTGTTTTAGCTAATTCA +TCAGTAGATATTATTTTACATGATACTTATTATGTAGTAGCTCATTTTCA +TTATGTTTTATCTATAGGAGCTGTATTTGCTATTATAGCAGGTTTTATTC +ACTGATACCCCTTATTTACTGGATTAACGTTAAATAATAAATGATTAAAA 
+AGTCATTTCATTATTATATTTATTGGAGTTAATTTAACATTTTTTCCTCA +ACATTTTTTAGGATTGGCTGGAATACCTCGACGTTATTCAGATTACCCAG +ATGCTTACACAACATGAAATATTGTATCAACTATTGGATCAACTATTTCA +TTATTAGGAATTTTATTCTTTTTTTTTATTATTTGAGAAAGTTTAGTATC +ACAACGACAAGTAATTTACCCAATTCAACTAAATTCATCAATTGAATGAT +ACCAAAATACTCCACCAGCTGAACATAGATATTCTGAATTACCACTTTTA +ACAAATTAATTTCTAATATGGCAGATTAGTGCAATAGATTTAAGCTCTAT +ATATAAAGTATTTTACTTTTATTAGAAAATAAATGTCTACATGAGCTAAT +TTAGGTTTACAAGATAGAGCTTCTCCTTTAATAGAACAATTAATTTTTTT +TCATGATCATGCATTATTAATTTTAGTAATAATTACAGTATTGGTGGGAT +ATTTAATATTTATATTATTTTTTAATAATTATGTAAATCGATTTCTTTTA +CATGGACAACTTATTGAAATAATTTGAACTATTTTACCAGCAATTATTTT +ACTATTTATTGCTCTTCCTTCTTTACGTTTACTTTATTTATTAGATGAAA +TTAATGAACCATCTGTAACTTTAAAAAGAATCGGCCATCAATGATATTGA +AGTTACGAATATTCAGATTTTAATAATATTGAATTTGATTCATATATAAT +TCCAACAAATGAATTAATAACTGATGGATTTCGATTATTAGATGTTGATA +ACCGAGTAGTTTTACCCATAAACTCACAAATTCGAATTTTAGTAACAGCT +GCTGATGTTATTCATTCTTGAACAGTACCTGCTTTAGGAGTAAAAGTTGA +CGGTACACCTGGACGATTAAATCAAACTAATTTTTTTATTAATCGACCGG +GTTTATTTTATGGTCAATGTTCAGAAATCTGTGGAGCTAATCATAGATTT +ATACCGATTGTAATTGAAAGTGTTCCTGTAAATTACTTTATTAAATGAAT +TTCTAGAAATAACTCTTCATTAGATGACTGAAAGCAAGTACTGGTCTCTT +AAACCATTTAATAGTAAATTAGCACTTACTTCTAATGATAAAAAATTAGT +TAAAATCATAACATTAGTATGTCAAACTAAAATTATTAAATAATTAATAT +TTTTTAATTCCACAAATAGCACCtattagatgattattattatttattat +tttttctattacatttattttattttgttctattaactattattcttatA +TACCAAATTCACCTAAATCTAATGAATTAAAAAATATCAACTTAAATTCA +ATAAATTGAAAATGATAACAAATTTATTTTCTGTATTCGACCCCTCAGCT +ATTTTTAATTTTTCACTTAATTGATTAAGAACATTTTTAGGACTTTTAAT +AATTCCGTCAATTTATTGATTAATACCTTCTCGTTACAATATTATATGAA +ATTCAATTTTATTAACTCTTCATAAAGAATTTAAAACTTTATTAGGCCCA +TCAGGTCATAATGGATCTACTTTTATTTTTATTTCTTTATTTTCATTAAT +TTTATTTAATAATTTCATAGGATTATTTCCATATATTTTTACAAGAACAA +GACATTTAACTTTAACTTTATCTTTAGCTTTACCTTTATGATTATGTTTT +ATATTATATGGATGAATTAATCATACACAACATATATTTGCTCATTTAGT +TCCTCAAGGAACACCCGCTATTCTTATACCTTTTATAGTATGTATTGAAA +CTATTAGAAATATTATTCGACCTGGAACATTAGCTGTTCGATTAACTGCT +AATATAATTGCTGGACATTTATTATTAACTCTTTTAGGAAATACAGGACC 
+TTCTATATCTTATATTTTAGTAACATTTTTATTAATAGCTCAAATTGCTT +TATTAGTATTAGAATCAGCTGTAGCTATAATTCAATCTTATGTGTTTGCT +GTATTAAGAACTTTATATTCTAGAGAAGTAAATTAATGTCTACACACTCA +AATCACCCTTTTCATTTAGTGGATTATAGTCCATGACCATTAACAGGAGC +TATCGGAGCTATAACAACTGTATCAGGTATAGTAAAATGATTTCATCAAT +ATGATATTTCATTATTTGTATTAGGTAATATTATTACTATTTTAACTGTA +TATCAATGATGACGAGATGTATCACGAGAAGGAACATACCAAGGATTACA +TACTTATGCAGTAACTATTGGTTTACGATGAGGAATAATTTTATTTATTT +TATCAGAAGTTTTATTTTTTGTGAGATTTTTTTGAGCTTTTTTTCACAGA +AGTTTATCACCCGCTATTGAATTAGGAGCATCATGACCTCCTATAGGAAT +TATCTCATTTAATCCATTTCAAATTCCTTTATTAAATACAGCTATTTTAT +TAGCTTCAGGAGTTACTGTAACTTGAGCCCACCATAGACTTATAGAAAAT +AATCATTCACAGACTACTCAAGGATTATTTTTTACAGTTTTACTAGGAAT +CTATTTTACAATTCTTCAAGCTTATGAATATATTGAAGCTCCATTTACTA +TTGCAGACTCAATTTATGGATCAACATTTTTTATAGCAACAGGATTTCAC +GGAATTCATGTATTAATCGGAACAACTTTTTTATTAGTATGTTTACTACG +ACATTTAAATAATCACTTCTCAAAAAATCATCATTTTGGTTTTGAAGCAG +CTGCATGATATTGACATTTTGTCGATGTAGTTTGATTATTTTTATATATC +ACAATTTACTGATGAGGAGGATAATTATATTATTAATTAAATATCTATAT +AGTATAAAAGTATATTTGACTTCCAATCATAAGGTCTATTAATTAATAGT +ATAGATAATTTTTTCTATTATTTTTATTGCTTTATTAATTTTACTAATTA +CAACTATTGTTATATTTTTAGCTTCAATTTTATCAAAAAAAGCTTTAATC +GACCGAGAAAAAAGATCCCCATTTGAATGTGGATTTGATCCAAAATCTTC +ATCTCGATTACCATTTTCTTTACGTTTTTTTTTAATTACTATTATTTTTT +TAATTTTTGATGTAGAGATTGCATTAATTCTACCTATAATTATTATTATA +AAATATTCTAATATTATAATTTGAACAATTACTTCAATTATTTTTATTTT +AATTTTATTAATTGGATTATACCATGAATGAAATCAAGGAATGTTAAATT +GATCAAACTAatatatttatatatatatatataGGGTTGTAGTTAAATAT +AACATTTGATTTGCATTCAAAAAGTATTGAATATTCAATCTACCTTATTA +ATTTAATAACTGAATATGAAGCGATTGATTGCAATTAGTTTCGACCTAAT +CTTAGGTAATTATACCCTTATTCTTTAATTGAAGCCAAAAAGAGGCATAT +CACTGTTAATGATATAATTGAATTTTAAATTCCAATTAAGGAAATATGAT +GATCAAGTAAAAGCTGCTAACTTTTTTCTTTTAATGGTTAAATTCCattt +atatttctatttatatagtttaaataaaaccttacattttcattgtaata +ataaaatcttatatttttatAAATTACTAAAATTAATTCACTATATCCAA +AGATTTAATAATCTCCATAACATCTTCAATGTCAAACTCTAGTATAAGCT +ATTTGGATATAAAAATAATAAAATTAATAAAATTAAAATTCAAAATACAA +ATAATAATAAATAAATTTTCAAAGAATTATTATGTATTAAAAATAAAGTT 
+TTAGAATATATAGATAATTTTTGATATAAATGTTGACCTCCAAAATATTC +TGATCAACCTTGATCAAAACTTTTTACAACTAATTGACCATAATTTAAAG +GATAAAAAATTATACCATAAGTTCTAATATAAGGTATAAATCATATAGAC +CCTAAAAAAGTTCTTAAATTATATATAAATAAAGATTTATTTAAAAAAAA +TAAATTTCTTAAAGAAATTAAATATCCAAATAAACCCCCTACAATACATA +CAAATAATGTTAACAATTTTATATAAATAGGTAAACAAATTATATAAGGA +AAAGGAAAAATCAATCAATTTAATATTCTACCTCCAATAATTCTTATAAT +TAATAATCCTATTATACCACGGAGTATAATTCAACTTTCATCATTTAATA +TATTCAATCTACCGCAATTTAAATCACCGGTTATTGAATAATAAACTAAT +CGAAATGAATAACTAACAGTTAAACCCGTAGAAAAATAGTATAAAAAAAA +TGAAAACATATTAACATTTCTAATTCTAACAATTTCTAAAATTATATCCT +TAGAATAGAATCCAGCTAAAAAAGGTATTCCACATAAAGCTAAATTAGAT +ACGTTAAAACAAGCTGAAGTTAAAGGTATATGAATTCTTAACCCCCCTAT +TAAACGAATATCTTGAGAATTATTTATATTATGAATAATAGCCCCAGCAC +ATATAAACAATAATGCTTTAAATAAAGCATGAGTTAATAAATGAAATATA +GCTAATTTTAAAAATCCTATAGACAAAATTCTTATTATTAAACCTAATTG +ACTTAAAGTAGATAAAGCAATAATTTTTTTTAAATCAAATTCAAAATTAG +CTCCTAATCCAGCTATAAATATTGTTAATCCAGATAATAATAATATTAAT +TGTCCTAACCAAGAAGTTCTTAAGATAATATTAAATCGAATTAATAAATA +TACACCAGCTGTAACTAATGTAGAAGAATGAACTAAAGCAGAAACAGGTG +TAGGAGCAGCTATAGCTGCAGGTAACCAAGAAGAAAAAGGAATCTGAGCT +CTTTTAGTTATAGCAGCTAATATTACTAATCTTCCAATTATTAACATTTC +AAATTCATTTTGTATAATTTCTAAATAAAAAATATAATTTCATCTTCCAT +AATTTAATATTCAAGCAATAGAAAGAAGTAAAGCTACATCCCCAATTCGA +TTAGATAACGCAGTTAATATACCAGCATTATAAGATTTAATATTTTGAAA +ATAAATTACTAAACAATAAGAAACAAGTCCTAAACCATCTCACCCTAATA +AAATTCTAATTAAATTTGGTCTAATAATTAACAATATTATTGATAAAACA +AATATTAATACTAATATAATGAATCGATTAATATGATTATCATTTATTAT +GTATTCTTTTCTATAAAAAATCACTAAAGAAGAAATTATAAGAACAAAAG +ATATAAATAATAAACTTATTCAATCAAAAAGAAAAGTTATAACAATTCTT +ATAGAATTTAAAGAAACTAATTCTCACTCAATAAAATAAATCATATCATT +TAACAAAAAATATAAACTTAATAAAAAACATGATAAACTTATAGAAATTA +AATTAACAAATCTAATTCTACAAATAGATAAATATTTCATGATTTAAAAT +GAATATTTTCATATCACTAACACCACAAATTAGTATTTTTTTTAAACTAT +TTAAATATAATCATAATATAAATGATTCTCTTTTTAAAATTAATAAATTT +AAAGGCAATCAATGTAACAATATTAATAAATATTCTCGAATTTTACCTCT +TCTAAATGAATATACTCCAGAAAATAATTTACCATGCTGACTAAAAGAAT +ATAAATATAAAGTATAAGCAGCTCTAAAAAAAGATAAAAAAGATAATAAA 
+ATTATAGAAATTCAAGATCAAGAAACAATTCTATTTAATAAATAAATTTC +TCCTAATAAATTTAATGTTGGAGGAGCTGCTATATTAGCTGATCTTAATA +AAAATCATCATAAAGTTATCGAAGGTATAAAATTTAATAAACCTTTATTA +ATTAATATTCTTCGACTTCCAAGACGTTCATAAGATACATTAGCTAAACA +AAATAACCCAGAAGAACATAAACCATGAGCAATTATTAATGTATAAGAAC +CACATAAACCTCAATAAGTTATAGTTAAAAGTCCTGATAGAACAATTCCT +ATATGAGCAACAGATGAATAAGCAATTAAAGCCTTTAAATCAGTTTGACG +TAAACAAACTAATCTAACTAATACACCTCCTActaatctaattctaattc +aaacaaatctatacttcaaattTATTAACTGTAAAAAACTAATAACTCGT +AATATTCCATAACCTCCTAATTTTAATATAATACCTGCTAAAATTATAGA +CCCAGAAACTGGAGCTTCAACATGAGCTTTAGGTAATCATAAATGAACTA +AAAATATTGGTATTTTTACTAAAAAGGCACACAATAAACAAAAATATAAT +AAATCGTAATTAAACATAAAATTATTTATTAAATAAAAATTTATAGAACC +AATTTTATTTATTaaataaaaaataccaattaatataggtaaagaaacta +ataaagtataaaataataaatataaaCCAGCTTGTAAACGTTCTGGCTGA +TAACCTCAACCTAAAATTAAAAATAATGTAGGAATTAGTCTTCTTTCAAA +AAATAAATAAAATATAAATAATCTTATTCTTGAAAAAGTTAAAATCAACA +ATAATAATAAAATAATAATATTTAATAAAAATAAATTTTTATAATTATTA +TGTTTATTAATTATTTCTCTAGCTAATAATATTAATGAACAAATTCATAA +ACTTAATAAAATTAATCCATAAGATAATATATCACAACCTAAAAAATAAG +AAATTTCTGATCAATAATTTATAAAATTATTTATTAATAAAAAAATAAAT +CTAATAAAAAATATTATAATTTGTACCATTCAATATATATTATTAATAAA +ACAAAAAGGAATTAAAAATAATAAAAAAAAAATAATTTTTAACATTATAT +AATTCTAAAAGATTGAAAATAATCATTACCATGAGTACGAATTATAGAAA +CTAAAATTGATAAACCTAAGGCCCCTTCACATACTCTAAATGTCAAAAAT +ATTATTCTAAAATAACTTTCATAATTTAATATATTTAAATAAATAAATAA +TATAAAAAATAATATTAAAACAATAAATTCTAAACTTAAAAGTATTGAAA +GTAAATGTTTCCGATTAGAAACAAAACAAAATAACCCTAAAATAAATAAA +ATTATAGGTAAACTTCAATATAAAATTATAATCATTAGTTTTAATAGTTT +AATAAAAACATTGGTCTTGTAAATCAAAAATAAGATTATTTCTTTTAAAA +CTTCAAGAGAAAAGAAATTTCTTTTTCATTAATCCCCAAAATTAATATTT +TAAATAAACTACCTCTTGAAATTATTCAATTAATATTATATTCATTAATT +ATTACTACTTCCATTATTTTTCTAAATATAATTCATCCATTAGCTTTAGG +ATTAACTTTATTAATTCAAACAATTTTTGTATGTTTACTAACTGGATTAA +TAACTAAAAGTTTTTGATATTCATATATTTTATTTTTAATTTTTTTAGGA +GGAATACTTGTATTATTTATTTACGTAACATCTTTAGCCTCTAATGAAAT +ATTTAATTTATCAATAAAATTAACTCTATTTTCTTCATTAATTTTAATTT +TTATATTAATTTTATCATTTATTATAGATAAAACTTCTTCTTCTTTATTT 
+TTAATAAATAATGATATACAATCTATTATTAATATAAATTCTTATTTTAT +AGAAAATTCTTTATCTTTAAATAAATTATATAATTTTCCTACAAATTTTA +TTACAATTTTATTAATAAATTATTTATTAATTACTTTAATTGTTATTGTA +AAAATTACAAAATTATTTAAAGGACCTATTCGAATAATATCTTAATTAAT +GAATAAACCTTTACGAAATTCCCATCCTCTATTTAAAATTGCCAATAATG +CTTTAGTAGATTTACCAGCTCCAATTAATATTTCAAGATGATGAAATTTT +GGATCATTACTTGGATTATGTTTAATTATTCAAATTTTAACCGGATTATT +TTTAGCTATACATTACACAGCTGATATTAATCTAGCTTTCTATAGTGTTA +ATCATATTTGTCGAGACGTTAATTATGGTTGATTATTACGAACTTTACAT +GCTAACGGTGCATCATTTTTTTTTATTTGTATTTACTTACATGTAGGACG +AGGAATTTATTACGGTTCATATAAATTTACTCCAACTTGATTAATTGGAG +TAATTATTTTATTTTTAGTAATAGGAACAGCTTTTATAGGATACGTATTA +CCTTGAGGACAAATATCATTTTGAGGAGCTACTGTAATTACTAATTTATT +ATCAGCTATCCCTTACTTAGGTATAGATTTAGTTCAATGATTATGAGGTG +GATTTGCTGTTGATAATGCCACTTTAACTCGATTTTTTACATTCCATTTT +ATTTTACCTTTTATTGTTCTTGCTATAACTATAATTCATTTATTATTCCT +TCATCAAACAGGATCTAATAATCCTATCGGATTAAATTCTAATATTGATA +AAATTCCTTTTCATCCTTATTTTACATTTAAAGATATTGTAGGATTTATT +GTAATAATTTTTATTTTAATTTCATTAGTATTAATTAGACCAAATTTATT +GGGAGACCCTGATAATTTTATTCCAGCAAATCCTTTAGTAACACCTGCCC +ATATTCAACCAGAATGATATTTTTTATTTGCTTATGCTATTTTACGATCT +ATTCCAAATAAATTAGGAGGAGTTATTGCATTAGTTTTATCAATTGCAAT +TTTAATAATCCTTCCTTTTTATAATTTAAGAAAATTCCGAGGGATTCAAT +TTTATCCTATTAATCAAGTAATATTCTGATCTATATTAGTAACAGTAATT +TTATTAACTTGAATTGGAGCTCGACCAGTTGAAGAACCTTATGTATTAAT +TGGACAAATTCTAACTGTTGTATATTTCTTATATTATTTAGTAAACCCAT +TAATTACAAAATGATGAGATAATTTATTAAATTAAATAGTTAATGAGCTT +GAATAAGCATATGTTTTGAAAACATAAGATAGAATTTAATTTTCTATTAA +CTTTTACTAAAAAAAATTCACTataataaagaaaataataaaattttaaa +cccaataaaaaataataaataatTTAAAGAAAAAGATAAAAAACATTTTC +AAGCTAAATATATTAATTTATCATAACGAAATCGAGGTAAAGTTCCTCGA +ACTCAAATAAAAACAAAAGAAATAAAAGTTAATTTTATATAAAATAATAA +ATTAAACACATCACAACCTAAAAAAATAACGCAAAATAATATTCTTATAA +ATAAAATTCTCGCATATTCAGCTATAAAAATTAAAGCAAAACCCCCTCTT +CTATATTCTACATTAAATCCTGAAACTAATTCTGATTCTCCTTCAGCAAA +ATCAAAAGGAGTCCGATTAGTTTCAGCTAATGAAATAGATATTCAAACTA +AAGCTATAGGAAATAAAATAATTAAAAATCACATATAAACTTGATAAAAA +AAAAAATAAATTATATTATAACTTCCAATTAAAAAAATAAAAGATAATAA 
+AATTAAAGCTAAACTAACTTCATAAGAAATAGTCTGAGCCACAGCTCGCA +AACCTCCTAATAAAGCATAATTAGAATTAGACGACCAACCAGCTACTATA +ACAGTATAAACCCCCAATCTAGTACAACATAAAAAAAATAAACCCCCCAA +ATTAAAAGAATATAATTTTACAAAAAAAGGTATACATATTCAAACAAATA +ATGATAAAAATAAAGAAAAAATTGGAGAAATATAATATCTTAAATAATTA +GATAATAAAGGATAAGTTTGTTCTTTTGTAAATAATTTAATCGCATCACA +AAAAGGTTGAGGAATTCCTATTAAACCAACTTTATTAGGACCTTTACGAA +TTTGAATATATCCTAAAACTTTTCGTTCTAATAAAGTTAAAAAAGCTACA +CTTACTAATACACAAATAATTAATAACAAACTACCAATTAATGACAAAAT +AAATTCTATATAAAACAAGTACTATTTGTAATAAAAATCACATATATAAA +TTCTAAATTTATTGCACTAATCTGCCAAAATAGTTTTATATTAATAATAT +TCTTATAAAAAATATAATTATTTTGATATTTGGTCCTTTCGTACTAAAAT +ATCATAATTTTTTAAAGATAGAAACCAACCTGGCTTACACCGGTTTGAAC +TCAGATCATGTAAGAATTTAAAAGTCGAACAGACTTAAAATTTGAACGGC +TACACCCAAAATTATATCTTAATCCAACATCGAGGTCGCAATCTTTTTTA +TCGATATGAACTCTCCAAAAAAATTACGCTGTTATCCCTAAAGTAACTTA +ATTTTTTAATCATTATTAATGGATCAAATATTCATAAATTTATGTTTTTA +AAAAATTAAAAGTTTTTTAAATTTTAATATCACCCCAATAAAATATTTTT +ATTTATTAAAATTTAATTAATCTATATAATTAAAATAAAAAAAAATATAA +AGATTTATAGGGTCTTCTCGTCTTTTAAATAAATTTTAGCTTTTTGACTA +AAAAATAAAATTCTATAAAAATTTTAAATGAAACAGTTAATATTTCGTCC +AACCATTCATTCCAGCCTTCAATTAAAAGACTAATGATTATGCTACCTTT +GCACAGTCAAAATACTGCGGCCATTTAAAATTTTCAGTGGGCAGGTTAGA +CTTTATATATAATTCAAAAAGACATGTTTTTGTTAAACAGGCGAATATTA +TTTTTGCCGAATTCTTTATTTAAACTTTTCATATAAATTAATTTTAACAT +TATTATATACTAATTTTATCATTATTACTTAATTTTAATAATTAAAACTA +ACATTTTAATAAATAATTAAAATTTAATAAATAATTTAATTTATAAAATA +AATTATAACATATTTTTTAATAATTGCTAATTCTAAGCATATATTTATTA +AATCTATTTAATATTTTTAAAAATTTATTTTATAGCTTATCCCATAAAAC +ATTAAAATTATAAATTAATTAATTAAATAAATAATTAAGTAAATTTATAA +TTTCTAAATTAAATTTATTTCTTAAAAAACTAGATACCTTTAAAAACGAA +TAACATTTCATTTCTAATATAATATTATAAATAATTTTATCACATTAACT +TAAATATTATATTAACTCTTTTAAAATCGAGAAAAATAAATATTTATTTT +TTATTTAATAAACACTGATACACAAGGTACAATAAATTAAATTTTCTTTT +AAAATAAAATTTTTTCAAATTATTTCAATTTTCTTTTACAATACTAATAA +ACTATTATTAAAATTATTTTTTCTTTAAACAATACTAAAACTTTAAATTT +TATAGTTATTTCTAATAATTTTTTAAAAAATAATAAAAATTAATAAATAA +AAACTAACTCAATTTATATTGATTTGCACAAAAATCTTTTCAATGTAAAT 
+GAAATACTTTACTTAATAAGCTTTAAATTGTCATTCTAGATACACTTTCC +AGTACATCTACTATGTTACGACTTATCTTACCTTAATAATAAGAGCGACG +GGCGATGTGTACATATTTTAGAGCTAAAATCAAATTATTAATCTTTATAA +TTTTACTACTAAATCCACTTTCAAAAATTTTTTCATAATTTTATTCATAT +AAATAAATTTATTGTAACCCATTATTACTTAAATATAAGCTACACCTTGA +TCTGATATAAatttttattaaaattattgaatattattattcttatAAAA +TATTCTGATAACGACGGTATATAAACTGATTACAAATTTAAGTAAGGTCC +ATCGTGGATTATCGATTAAAAAACAGGTTCCTCTAGATAGACTAAAATAC +CGCCAAATTTTTTAAGTTTCAAGAACATAACTATTACTACTTTAGCAATT +TATTTACATTTTAAATAATAGGGTATCTAATCCTAGTTTTTTATTAAAAT +TTTTTAACCTCAATTACATTTTTATATAATAATTTAAATATAAAATTTCA +CTTAATATATTTAATTTTATTATTATTAATAAATTTAATTTAATTAATAC +TAAAAAAATTTATTTGTATTAATGGTATAACCGCGACTGCTGGCACCAAT +TTAGTCAATACTTTTTTATATTGCTATTTCTAAATTTCTTTAATTAATAA +TATTAATTACTGCGAATAAATTTTCATATTTATTTTTTAAATAAATATAA +AATCACACAAAAATTTACATATAAATCAAATTAATAACAAATTTTTAAGC +CAAAATAAAACTTTAAATTTTTATTTTTGATTTTTTATTATTAATTAAAT +ATTAATAATTTTTATTAAAATAATTTTTTAAAGAAAAATTAAAATTAATT +TTAATTAAATATTAAAATAATTTAATTTTATAATAAAATTTTTATCATAT +TATAATAATATAAAAATTTTATAAATTTATTTTTTAAATTTTACAAAATT +TTTAAAATTTTTATTTTTTTTAAAAAAAATAATTTTTAACAAAAAAAATT +TTTATCAAAAATTAATATAAAATAAATTTTAATTTAAAAATTAAAAATTT +TAATTTTACACTTTTTTAAAAATATTTTTTTTTAAAAAAAAAATTTTTTT +TTAAAAAAATTTTTTTTTAAAAAAAATTTAAAAAATTATAGATTAATTTC +TTTTAAATGACTAAAAAAAATTTTTTTTTTTAAGTATTTTAAAACTTTTT +TTTTACAATTTTTAAAAAAATATATAAATATAAATTTTAAAAAAAATTTT +TTTTTTAAAAAAAATGAAAATTATATTATAAAAATATTTTTTTTACAAAA +ATGAAAATTTAATCTATTAAAAAAAATTATTAAAATTTTTATAAATAAAT +AAAAAAAGTAATAAATTTATTAAAAATCAATATATATATAATAATAAATA +ATTTGATTATTAATTAAATTATACGAATAATAAATATAATAAATAATTTA +TTTTAATCAATAAATCTGAAATAATTAATTATATACATATATATATATAT +GTAAATAAATAAAAATAAATTTATTCCCCCTATTTATAAATTTATTATAT +AATTAAAACTTAAAAAATATTTTTTTTAAAAAAATAGTTTATTAAATTAT +ACTTAATAAACTATTTTTATAATAAATTATTTTATAAATAAAATTATTTA +AAATAATTAATAAAAATATTTTTATTGTAATAAAAATTAAAAATAATTTT +AAAAAAATTAAATTTATATATTTATATATATATATATATAATTTTTAATT +TTCAATTAAATTATATAAATATAATAAAATAATTTTATTTAATCACTAAA +TCTGAAATAATTAATtataaatatatatatatatatatatatatatatat 
+atatataAATGAAAATAAATTTATTCCCCCTATTCATAAATTTATTGTAT +AATTAAAACTTAAAAAATATTTTTTTTTAAAAAAAAATTATTTATTAAAT +TATACTTAATAAACTATTTTTATAATAAATTATTTTATAAATAAAATTAT +TTTAAATAATTAATAAAAATATTTTTAATATAATAAAAATTTAAAATGAT +TTTTTATAAAAATTAAATTCATATTTATATATATATATATATAATTTAAT +TTTCAATTAAATTATATAAGTATAATAAAATAATTTATTTTAATCACTAA +ATCTGAATTAATTAATTGTATATATATATATATATATAAAAAAAATGAAA +ATAAATTTATTCCCCCTATTCATAAATTTATTATATAATTAAATCTTAAA +AAGTATTTTTTTTTTAAAAAAAAATTATTTATTAAATTATACTTAATAAA +CTATTTTTATAATAAATTATTTTATGAATAAAATTATTTAAAATAATTTA +TAAAAATATTTTTAATATAATGAAAATTTAAAATGATTTTTTATTATTAA +TTAAATTCatatatttacatatatatatatatatatatatatatataGAT +AATTTAATTTTCAATTAAATTATATAAGTATAATAAAATAATTTATTTTA +ATCACTAAATCTGAATTAATTAATTGTATGTATATATATATATATATATA +TAAAAAAATGAAAATAAATTTATTCCCCCTATTCATAAATTTATTATATA +ATTAAATCTTAAAAAGTATTTTTTTTTAAAAAAAAAATTATTTATTAAAT +TATACTTAATAAACTATTTTTATAATAAATTATTTTATGAATAAAATTAT +TTAAAATAATTAATAAAAATATTTTTAATATAATGAAAATTTAAAATGAT +TTTTTATTATTAATTAAATTCatatatttatatatatatatatatatata +tatataGATAATTTAATTTTCAATTAAATTATATAAGTATAATAAAATAA +TTTATTTTAATCACTAAATCTGAATTAATTAATTGTATGtatatatatat +atatatatatatatatatataAAAAATGAAAATAAATTTATTCCCCCTAT +TCATAAATTTATTATATAATTAAATCTTAAAAAGTATTTTTTTTTAAAAA +AAAATTATTTATTAAATTATACTTAATAAACTATTTTTATAATAAATTAT +TTTATGAATAAAATTATTTAAAATAATTAATAAAAATTATATATATATAT +AAATGAAAATAATTTTTAAATTTTAATAATAAATAAATTTAATAATTAAT +AATTAAATAAAATCTATTCATTATTAATATTTAATTAATAATAAATAAAT +TTAATAACTAATAATTAAATAAAATTTATTTATTATTAATATTTAATTAA +TAATAAAAAATCATCAttttttttttttttttttttatttAATTAATTAT +tatatatttataaatttatatattattcaatatttataatatatatatat +atatatatatataAAAATTAAATTATTTAAATAATTTAATATAAATTTTT +AAAAAATTTCTTAAATGTATTATTTTTATAAAAAATATTTATATAATAAA +ATTATTTTTTTTTAAAAATAAACAAAAAATTTTTAATAAATAAATTTTAT +AATGAAATATAATTTATTTATTTTTTATTTTTTTAAAAAAAATTTAAAAA +AAAATAATTTTTTTTTAAAAAAAAACTATATACTAATTATAAATTAATAG +ATATTTATATATATATAAATATTTAATATATTATTATATATCTAATAATT +TAAATAAAAAATTTTAAAATTTAAAAATGTAGATATAATTTATAAAAATT +TATATTCTCATATTTATTTATTATTAATTTAATTTATATAAATAATATAA 
+TAATTTAATTAATTATTATATATTTATAAATTTATATATTATTGAATATT +TATATAATATATATATATATATAGAAAAATTAAATTATTTAAATAATTTA +ATATAAATTTTTAAAAAATTTCTTAAATGTATTATTTTTATAAAAAATAT +TTATATAATAAAATCATTTTTTTTTAAAAATAAACAAAAAATTTTTAATA +AATAAATTTTATAATGAAATATAATTTATTTATTTTTTATTTTTTTAAAA +AAAAATTTTTTAAAAAAAAATAATTTTTTTTTTAAAAAAACTATATACTA +ATTATAAATTAATAGATATTTATATATATATAAATATTTAATATATTATT +ATATATCTAATAATTTAAATAAAAAATTTTAAAATTTAAAAATGTAGATA +TAATTTATAAAAATTTATATTCTCATATTTATTTATTATTAATTTAATTT +ATATAAATAATATAATAATTTAATTAATTATTATATATTTATAAATTTAT +ATATTATTGAATATTTATATAATATATATATATATATAGAAAAATTAAAT +TATTTAAATAATTTAATATAAATTTTTAAAAAATTTCTTAAATGTATTAT +TTTTATAAAAAATATTTATATAATAAAATCATTTTTTTTAAAAATAAACA +AAAAATTTTTAATAAATAAATTTTATAATGAAATATAATTTATTTATTTT +TTATTTTTTTAAAAAAAATTTTTTAAAAAAAAATAATTTTTTTTTAAAAA +AACTATATACTAAATATAAATTAATAGATATTTATATATATATAAATATT +TAATATATTATTATATATCTAATAATTTAAATAAAAAATTTTAAAATTTA +AAAATGTAGATATAATTTATAAAAATTTATATTCTCATATTTATTTATTA +TTAATTTAATTTATATAAATAATATAATAATTTAATTAATTATTATATAT +TTATAAATTTATATATTATTGAATATTTATATAATATATATATATATATA +GAAAAATTAAATTATTTAAATAATTTAATATAAATTTTTAAAAAATTTCT +TAAATGTATTATTTTTATAAAAAATATTTATATAATAAAATCATTTTTTT +TTAAAAATAAACAAAAAATTTTTAATAAATAAATTTTATAATGAAATATA +ATTTATTTATTTTTTATTTTTTTTAAAAAAAATTTTTTAAAAAAAATAAT +TTTTTTTTAAAAAAACTATATACTAAATATAAATTAATAGATATTTATAT +ATATATAAATATTTAATATATTATTATATATCTAATAATTTAAATAAAAA +ATTTTAAAATTTAAAAATGTAGATATAATTTATAAAAATTTATATTCTCA +TATTTATTTATTATTAATTTAATTTATATAAATAATATAATAATTTAATT +AATTATTATATATTTATAAATTTATATATTATTGAATATTTATATATAAT +ATATATATATATAGAAAAATAAAATTATTTAAATAATTTTACATAAAATT +TTAAAAAATTTCTTAAATGTATTATTTAATAAAAAATTACTTTTTAAAAA +AAATAATTTTAATTTTTTaaaaaaaatagtaaataataaaaaaaaaaaaa +aaaaaaaatgaaaaTTATATTATT diff -r 000000000000 -r 1435d142041b test-data/chrM_repeatmasker.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chrM_repeatmasker.txt Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,21 @@ + SW perc perc perc query position in query matching repeat position in repeat +score div. del. ins. 
sequence begin end (left) repeat class/family begin end (left) ID + + 16 20.2 5.9 0.0 chrM 1211 1261 (18263) + (TTTTA)n Simple_repeat 1 54 (0) 84486 + 13 23.9 2.2 2.2 chrM 2014 2059 (17465) + (TTA)n Simple_repeat 1 46 (0) 84487 + 24 18.8 5.3 2.6 chrM 3924 3999 (15525) + (TAT)n Simple_repeat 1 78 (0) 84488 + 18 4.5 0.0 0.0 chrM 5961 5983 (13541) + (AT)n Simple_repeat 1 23 (0) 84489 + 13 25.9 4.0 4.0 chrM 6247 6320 (13204) + (ATTTAT)n Simple_repeat 1 74 (0) 84490 + 11 14.6 7.5 2.4 chrM 8783 8822 (10702) + (CTAATT)n Simple_repeat 1 42 (0) 84491 + 17 19.0 0.0 8.6 chrM 9064 9126 (10398) + A-rich Low_complexity 1 58 (0) 84492 + 13 21.0 5.9 1.9 chrM 11723 11773 (7751) + (ATA)n Simple_repeat 1 53 (0) 84493 + 66 20.4 12.3 12.3 chrM 12823 13001 (6523) C LSU-rRNA_Cel rRNA (1) 2431 2253 84494 + 16 16.6 0.0 2.9 chrM 14361 14396 (5128) + (ATT)n Simple_repeat 1 35 (0) 84495 + 44 2.4 0.0 0.0 chrM 15966 16007 (3517) + (TA)n Simple_repeat 1 42 (0) 84496 + 35 5.3 0.0 0.0 chrM 16559 16597 (2927) + (AT)n Simple_repeat 1 39 (0) 84497 + 36 2.9 0.0 0.0 chrM 16922 16956 (2568) + (AT)n Simple_repeat 1 35 (0) 84498 + 37 0.0 0.0 0.0 chrM 17040 17071 (2453) + (TA)n Simple_repeat 1 32 (0) 84499 + 20 4.3 0.0 0.0 chrM 17417 17440 (2084) + (T)n Simple_repeat 1 24 (0) 84500 + 31 6.9 6.3 1.5 chrM 17451 17513 (2011) + (TA)n Simple_repeat 1 66 (0) 84501 + 26 17.0 0.0 0.0 chrM 19469 19514 (10) + A-rich Low_complexity 1 46 (0) 84502 + diff -r 000000000000 -r 1435d142041b test-data/tool_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tool_wrapper.sh Tue May 23 18:37:22 2017 -0400 @@ -0,0 +1,19 @@ +input_base=$1 +baseReference=$2 + +bowtie-build ${baseReference}.fa ${baseReference} + +python RepEnrich_setup.py ${baseReference}_repeatmasker.txt ${baseReference}.fa \ + setup_folder_${baseReference} + +bowtie $baseReference -p 16 -t -m 1 -S --max ${input_base}_multimap.fastq \ + ${input_base}.fastq ${input_base}_unique.sam + +samtools view -bS ${input_base}_unique.sam > 
${input_base}_unique.bam +samtools sort ${input_base}_unique.bam ${input_base}_unique_sorted +mv ${input_base}_unique_sorted.bam ${input_base}_unique.bam +samtools index ${input_base}_unique.bam +rm ${input_base}_unique.sam + +python RepEnrich.py ${baseReference}_repeatmasker.txt ${input_base} ${input_base} \ + setup_folder_${baseReference} ${input_base}_multimap.fastq ${input_base}_unique.bam --cpus 16