# HG changeset patch # User petr-novak # Date 1560341949 14400 # Node ID 2e811f988e1d837b6e4d570c2dd03db3ad24046a # Parent a4cd8608ef6b6c67474a7df0b21891c72ea01b56 Uploaded diff -r a4cd8608ef6b -r 2e811f988e1d Galaxy_integration.org --- a/Galaxy_integration.org Mon Apr 01 07:56:36 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ - -#+BEGIN_SRC sh -/home/petr/anaconda3/bin/planemo shed_init --name=repeatexplorer_utilities \ - --owner=repeatexplorer \ - --description="some utilities for data preprocessing" \ - --long_description="some utilities for data preprocessing" \ - --category="Fasta Manipulation" -#+END_SRC -# this create file .shed.yml - - diff -r a4cd8608ef6b -r 2e811f988e1d README.html --- a/README.html Mon Apr 01 07:56:36 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - - - - - -README.html - - - - - -

RepeatExplorer utilities

This repository includes utilities for preprocessing NGS data into a format suitable for RepeatExplorer and TAREAN -analysis. Each tool also includes an XML file which defines the tool interface for the Galaxy environment.

Available tools

Paired fastq reads filtering and interlacing

tool definition file: paired_fastq_filtering.xml

This tool is designed for memory-efficient preprocessing of two fastq files. The output of this tool can be used as input for RepeatExplorer clustering. Input files can be gzip-compressed (.gz extension). Reads are filtered based on quality, presence of N bases and adapters. The two input fastq files are processed in parallel. Only complete pairs are kept. As the input files are processed in chunks, it is required that paired reads are complete and in the same order in both input files. All reads which pass the quality filter will be written into the output files. If sampling is specified, only a sample of the sequences will be returned. Cutadapt is run with these options:

--anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ---anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ---anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' ---anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ---anywhere='CAAGCAGAAGACGGCATACGAGAT' ---anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ---error-rate=0.05 ---times=1 --overlap=15 --discard

Order of fastq files processing

Trimming (optional)
Filter by quality
Discard single reads, keep complete pairs
Cutadapt filtering
Discard single reads, keep complete pairs
Sampling (optional)
Interlacing two fasta files

single fastq reads filtering

tool definition file: single_fastq_filtering.xml

This tool is designed to perform preprocessing -of a fastq file. Input files can be gzip-compressed (.gz extension). Reads -are filtered based on quality, presence of N bases and adapters. All reads -which pass the quality filter will be written into the output files. If sampling is -specified, only a sample of the sequences will be returned.

fasta affixer

tool definition file: fasta_affixer.xml

Tool for appending a prefix and suffix to sequence names in fasta-formatted sequences. This tool is useful -if you want to do comparative analysis with RepeatExplorer and need to -append sample codes to sequence identifiers.

Dependencies

R programming environment with installed packages optparse and ShortRead (Bioconductor) -python3 -cutadapt

License

Copyright (c) 2012 Petr Novak (petr@umbr.cas.cz), Jiri Macas and Pavel Neumann, -Laboratory of Molecular Cytogenetics(http://w3lamc.umbr.cas.cz/lamc/) -Institute of Plant Molecular Biology, Biology Centre AS CR, Ceske Budejovice, Czech Republic

This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version.

This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. -You should have received a copy of the GNU General Public License -along with this program. If not, see http://www.gnu.org/licenses/.

- - diff -r a4cd8608ef6b -r 2e811f988e1d RM_custom_search.py.bak --- a/RM_custom_search.py.bak Mon Apr 01 07:56:36 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -#!/usr/bin/env python -''' RepeatMasker search against custom database -input: -- archive with sequencing data -- custom repeat database -''' -import zipfile -import shutil -import os -import subprocess -import parallel -import glob - -def extract_sequences(f): - # check archive: - try: - z=zipfile.ZipFile(f) - # extract only dirCLXXXX/reads.fas - seq_list = [] - for filein in z.namelist(): - if filein.lower().startswith("seqclust/clustering/clusters/dir_cl") and filein.endswith("reads.fas"): - outdir = filein.split("/")[3] - outfile = outdir +"/reads.fas" - source = z.open(filein) - os.mkdir(outdir) - target = file(outfile, "wb") - shutil.copyfileobj(source, target) - seq_list.append(outfile) - if len(seq_list) == 0: - raise ValueError() - - except zipfile.BadZipfile as e: - print "unable to extract sequences from archive!" - raise e - - except IOError as e: - print "unable to extract sequences from archive!" - raise e - - except ValueError as e: - print "No sequences found in archive!" 
- raise e - - seq_list.sort() - return seq_list - -def get_RM_dir(config_file,galaxy_dir): - shutil.copy(config_file,"seqclust.config") - f = open("seqclust.config",'a') - f.write("\necho $REPEAT_MASKER") - f.close() - args = ["bash", "seqclust.config"] - p = subprocess.Popen(args,stdout = subprocess.PIPE) - RMdir = "{0}{1}".format(galaxy_dir,p.stdout.readline().strip()) - return RMdir - -def RepeatMasker(RM,reads,database): - args = [RM, reads, "-q", "-lib", database, "-pa", "1" , "-nolow", "-dir", os.path.dirname(reads)] - status=subprocess.call(args , stderr = open(os.devnull, 'wb')) - return status - -def summarizeRepeatMaskerOutput(htmlout = "summary.html"): - cmd = os.path.dirname(os.path.abspath(__file__))+"/rmsk_summary_table_multiple.r" - args = [ cmd, "-f", "dir_CL*/reads.fas", "-r", "dir_CL*/reads.fas.out", "-o", "RM-custom_output_table" ] - status=subprocess.call(args) - cmd = cmd = os.path.dirname(os.path.abspath(__file__))+"/RM_html_report.R" - args = [cmd, htmlout] - status=subprocess.call(args) - return status - - -def main(): - from optparse import OptionParser - - parser = OptionParser() - parser.add_option("-i", "--input_file", dest="input_file", help="seqclust zip archive") - parser.add_option("-d", "--database", dest="database", help="custom repeatmasker database") - parser.add_option("-g", "--galaxy_dir", dest="galaxy_dir", help="Galaxy home directory") - parser.add_option("-r", "--report", dest="report", help="output html file with report summary",default='report.html') - - options, args = parser.parse_args() - config_file = os.path.dirname(os.path.abspath(__file__))+"/seqclust.config" - - - seq_files = extract_sequences(options.input_file) ### REMOVE - TESTING - RMdir = get_RM_dir(config_file, options.galaxy_dir) - parallel.parallel(RepeatMasker, [RMdir+"/RepeatMasker"], seq_files, [options.database]) - - status = summarizeRepeatMaskerOutput(options.report) - - - -if __name__== "__main__": - main() - - diff -r a4cd8608ef6b -r 2e811f988e1d 
__init__.py diff -r a4cd8608ef6b -r 2e811f988e1d extract_contigs_from_archive.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_contigs_from_archive.xml Wed Jun 12 08:19:09 2019 -0400 @@ -0,0 +1,13 @@ + + + unzip -p ${RepeatExplorer_archive} contigs.fasta > ${contigs} + + + + + + + + + +