diff profrep.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak
---|---
date | Wed, 26 Jun 2019 08:01:42 -0400
parents |
children |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/profrep.py Wed Jun 26 08:01:42 2019 -0400
@@ -0,0 +1,1249 @@

#!/usr/bin/env python3

import subprocess
import csv
import time
import sys
import matplotlib
# matplotlib.use("PDF")
matplotlib.use("pdf")
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import multiprocessing
import argparse
import os
from functools import partial
from multiprocessing import Pool
from tempfile import NamedTemporaryFile
from operator import itemgetter
from itertools import groupby
import gff
import configuration
import visualization
import distutils
from distutils import dir_util
import tempfile
import re
from Bio import SeqIO
import pickle
import shutil
import warnings
import random
import numpy as np
import dante_gff_output_filtering as domains_filtering
import dante as protein_domains

t_profrep = time.time()
np.set_printoptions(threshold=sys.maxsize)
warnings.filterwarnings("ignore", module="matplotlib")


class Range():
    '''
    This class is used to check a float range in argparse
    '''

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)


def get_version(path):
    branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                     shell=True,
                                     cwd=path).decode('ascii').strip()
    shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
                                        shell=True,
                                        cwd=path).decode('ascii').strip()
    revcount = len(subprocess.check_output("git log --oneline",
                                           shell=True,
                                           cwd=path).decode('ascii').split())
    version_string = ("-------------------------------------"
                      "-------------------------------------\n"
                      "PIPELINE VERSION : "
                      "{branch}-rv-{revcount}({shorthash})\n"
                      "-------------------------------------"
                      "-------------------------------------\n").format(
                          branch=branch,
                          shorthash=shorthash,
                          revcount=revcount, )
    return version_string


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_fasta_id(QUERY):
    forbidden_ids = []
    headers = []
    for record in SeqIO.parse(QUERY, "fasta"):
        if any(x in record.id for x in configuration.FORBIDDEN_CHARS):
            forbidden_ids.append(record.id)
        headers.append(record.id)
    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs (before the first space) are the same''')
    return forbidden_ids, headers


def multifasta(QUERY):
    ''' Create single-fasta temporary files to be processed sequentially '''
    PATTERN = ">"
    fasta_list = []
    with open(QUERY, "r") as fasta:
        reader = fasta.read()
        splitter = reader.split(PATTERN)[1:]
        for fasta_num, part in enumerate(splitter):
            ntf = NamedTemporaryFile(delete=False)
            ntf.write("{}{}".format(PATTERN, part).encode("utf-8"))
            fasta_list.append(ntf.name)
            ntf.close()
    return fasta_list


def fasta_read(subfasta):
    ''' Read fasta; obtain the header and the sequence without gaps '''
    sequence_lines = []
    with open(subfasta, "r") as fasta:
        header = fasta.readline().strip().split(" ")[0][1:]
        for line in fasta:
            clean_line = line.strip()
            if clean_line:
                sequence_lines.append(clean_line)
    sequence = "".join(sequence_lines)
    return header, sequence
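# A minimal usage sketch (hypothetical file name): split a multi-fasta query
# into per-sequence temporary files, read each one back, then remove it --
# this mirrors the main loop below.
#
#   for subfasta in multifasta("query.fasta"):
#       header, sequence = fasta_read(subfasta)
#       print(header, len(sequence))
#       os.unlink(subfasta)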

def cluster_annotation(CL_ANNOTATION_TBL):
    ''' Create a dictionary of known annotation classes and related clusters '''
    cl_annotations = {}
    annot_table = np.genfromtxt(CL_ANNOTATION_TBL, dtype=str)
    for line in annot_table:
        if line[1] in cl_annotations:
            cl_annotations[line[1]].append(line[0])
        else:
            cl_annotations[line[1]] = [line[0]]
    return list(cl_annotations.items()), list(cl_annotations.keys())


def read_annotation(CLS, cl_annotations_items):
    ''' Dictionary of known repeat classes and related reads '''
    reads_annotations = {}
    with open(CLS, "r") as cls_file:
        count = 0
        for line in cls_file:
            line = line.rstrip()
            count += 1
            if count % 2 == 0:
                reads = re.split(r"\s+", line)
                for element in reads:
                    for key, value in cl_annotations_items:
                        if clust in value:
                            reads_annotations[element] = key
            else:
                clust = re.split(r"\s+", line)[0].split(">CL")[1]
    return reads_annotations


def annot_profile(annotation_keys, part):
    ''' Predefine a dictionary of known annotations and partial sequence
    repetitive profiles defined by the parallel processes '''
    subprofile = {}
    for key in annotation_keys:
        subprofile[key] = [np.zeros(part, dtype=int), np.zeros(part,
                                                               dtype=int)]
    subprofile["ALL"] = [np.zeros(part, dtype=int), np.zeros(part, dtype=int)]
    return subprofile
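# Input sketch for cluster_annotation (hypothetical values): the table read by
# np.genfromtxt is whitespace-separated, cluster ID first, classification
# second:
#
#   1   LTR/Ty1_copia
#   2   LTR/Ty3_gypsy
#   3   LTR/Ty1_copia
#
# This yields cl_annotations == {'LTR/Ty1_copia': ['1', '3'],
# 'LTR/Ty3_gypsy': ['2']}, returned as (items, keys) lists; read_annotation
# then maps every read of clusters 1 and 3 to 'LTR/Ty1_copia'.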

def parallel_process(WINDOW, OVERLAP, seq_length, annotation_keys,
                     reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                     BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER,
                     last_index, subsets_num, subset_index):
    ''' Run the parallel function to process the input sequence in windows:
    run blast for the subsequence defined by the input index and window size,
    then create and increment the subprofile vector based on the reads aligned
    within the window '''
    loc_start = subset_index + 1
    loc_end = subset_index + WINDOW
    if loc_end >= seq_length:
        loc_end = seq_length
        subprofile = annot_profile(annotation_keys, seq_length - loc_start + 1)
    else:
        subprofile = annot_profile(annotation_keys, WINDOW + 1)

    # Find HSP records for every window defined by the query location and
    # parse the tabular stdout:
    # 1. query, 2. database read, 3. alignment start, 4. alignment end,
    # 5. bitscore
    p = subprocess.Popen(
        "blastn -query {} -query_loc {}-{} -db {} -evalue {} -word_size {} -dust {} -task {} -num_alignments {} -outfmt '6 qseqid sseqid qstart qend bitscore pident'".format(
            subfasta, loc_start, loc_end, BLAST_DB, E_VALUE, WORD_SIZE,
            DUST_FILTER, BLAST_TASK, MAX_ALIGNMENTS),
        stdout=subprocess.PIPE,
        shell=True)
    count_hits = 0
    for line in p.stdout:
        column = line.decode("utf-8").rstrip().split("\t")
        if float(column[4]) >= BITSCORE:
            count_hits += 1
            read = column[1]  # ID of the individual aligned read
            if "reduce" in read:
                reads_representation = int(read.split("reduce")[-1])
            else:
                reads_representation = 1
            qstart = int(column[2])  # starting position of the alignment
            qend = int(column[3])  # ending position of the alignment
            if read in reads_annotations:
                annotation = reads_annotations[read]
            else:
                annotation = "ALL"
            subprofile[annotation][0][
                qstart - subset_index - 1:qend - subset_index] = subprofile[
                    annotation][0][qstart - subset_index - 1:qend -
                                   subset_index] + reads_representation
            subprofile[annotation][1][qstart - subset_index - 1:qend -
                                      subset_index] = subprofile[annotation][
                                          1][qstart - subset_index - 1:qend -
                                             subset_index] + float(column[
                                                 5]) * reads_representation
    subprofile["ALL"][0] = sum([item[0] for item in subprofile.values()])
    subprofile["ALL"][1] = sum([item[1] for item in subprofile.values()])
    for repeat in subprofile.keys():
        subprofile[repeat][1] = [int(round(quality / hits_num))
                                 if hits_num != 0 else quality
                                 for hits_num, quality in zip(subprofile[
                                     repeat][0], subprofile[repeat][1])]
    if subset_index == 0:
        if subsets_num == 1:
            subprf_name = subprofile_single(subprofile, subset_index)
        else:
            subprf_name = subprofile_first(subprofile, subset_index, WINDOW,
                                           OVERLAP)
    elif subset_index == last_index:
        subprf_name = subprofile_last(subprofile, subset_index, OVERLAP)
    else:
        subprf_name = subprofiles_middle(subprofile, subset_index, WINDOW,
                                         OVERLAP)
    return subprf_name


def subprofile_single(subprofile, subset_index):
    subprofile['idx'] = list(range(1, len(subprofile["ALL"][0]) + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_first(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][0:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][0:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + 1, subset_index + WINDOW -
                                   OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name
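# Worked example of the window bookkeeping (defaults WINDOW=5000, OVERLAP=150,
# so OVERLAP // 2 == 75): the first window spans positions 1..5001 and
# subprofile_first trims the trailing 76 values, keeping idx 1..4925
# (= WINDOW - OVERLAP // 2). A middle window starting at subset_index keeps
# idx subset_index + 76 .. subset_index + 4925, and subprofile_last keeps
# everything from subset_index + 76 to the sequence end, so with
# STEP = WINDOW - OVERLAP = 4850 the trimmed subprofiles tile the sequence
# exactly once (..., 4925 | 4926, ..., 9775 | 9776, ...).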

def subprofiles_middle(subprofile, subset_index, WINDOW, OVERLAP):
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:-OVERLAP // 2 - 1]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:-OVERLAP // 2 - 1]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + WINDOW - OVERLAP // 2 + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def subprofile_last(subprofile, subset_index, OVERLAP):
    len_subprofile = len(subprofile['ALL'][0])
    for key in subprofile.keys():
        subprofile[key][0] = subprofile[key][0][OVERLAP // 2:]
        subprofile[key][1] = subprofile[key][1][OVERLAP // 2:]
    subprofile['idx'] = list(range(subset_index + OVERLAP // 2 + 1,
                                   subset_index + len_subprofile + 1))
    subprf_dict = NamedTemporaryFile(suffix='{}_.pickle'.format(subset_index),
                                     delete=False)
    with open(subprf_dict.name, 'wb') as handle:
        pickle.dump(subprofile, handle, protocol=pickle.HIGHEST_PROTOCOL)
    subprf_dict.close()
    return subprf_dict.name


def concatenate_prof(subprofiles_all, files_dict, seq_id, HTML_DATA,
                     wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
        exclude = set(["idx"])
        for key in set(individual_dict.keys()).difference(exclude):
            if any(individual_dict[key][0]):
                indices = handle_zero_lines(individual_dict[key][0])
                if key not in files_dict.keys():
                    prf_name = "{}/{}.wig".format(HTML_DATA, re.sub(
                        r'[\/\|]', '_', key))
                    prf_qual_name = "{}/{}_qual.wig".format(
                        HTML_DATA, re.sub(r'[\/\|]', '_', key))
                    prf_file = open(prf_name, "w")
                    prf_q_file = open(prf_qual_name, "w")
                    prf_file.write("{}{}\n".format(
                        configuration.HEADER_WIG, seq_id))
                    prf_q_file.write("{}{}\n".format(
                        configuration.HEADER_WIG, seq_id))
                    for i in indices:
                        prf_file.write("{}\t{}\n".format(individual_dict[
                            'idx'][i], individual_dict[key][0][i]))
                        prf_q_file.write("{}\t{}\n".format(individual_dict[
                            'idx'][i], int(individual_dict[key][1][i])))
                    files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                    wig_files.append(prf_file)
                    wig_files.append(prf_q_file)
                    prf_file.close()
                    prf_q_file.close()
                else:
                    prf_name = files_dict[key][0]
                    prf_qual_name = files_dict[key][2]
                    with open(prf_name, "a") as prf_file, open(
                            prf_qual_name, "a") as prf_q_file:
                        if seq_id not in files_dict[key][1]:
                            prf_file.write("{}{}\n".format(
                                configuration.HEADER_WIG, seq_id))
                            prf_q_file.write("{}{}\n".format(
                                configuration.HEADER_WIG, seq_id))
                            files_dict[key][1].append(seq_id)
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i], individual_dict[
                                    key][0][i]))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i], int(
                                    individual_dict[key][1][i])))
    return files_dict, wig_files
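# Output sketch: each annotation class gets two wiggle tracks, one with hit
# counts per position and one ('*_qual.wig') with the mean alignment identity.
# Assuming configuration.HEADER_WIG is a wiggle declaration whose chrom field
# is completed by the sequence ID, a track fragment might look like
# (hypothetical values):
#
#   variableStep chrom=seq1
#   101	4
#   102	4
#   103	5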

def concatenate_prof_CN(CV, subprofiles_all, files_dict, seq_id, HTML_DATA,
                        wig_files):
    for subprofile in subprofiles_all:
        with open(subprofile, 'rb') as handle:
            individual_dict = pickle.load(handle)
        exclude = set(["idx"])
        for key in set(individual_dict.keys()).difference(exclude):
            if any(individual_dict[key][0]):
                indices = handle_zero_lines(individual_dict[key][0])
                if key not in files_dict.keys():
                    prf_name = "{}/{}.wig".format(HTML_DATA, re.sub(
                        r'[\/\|]', '_', key))
                    prf_qual_name = "{}/{}_qual.wig".format(
                        HTML_DATA, re.sub(r'[\/\|]', '_', key))
                    prf_file = open(prf_name, "w")
                    prf_q_file = open(prf_qual_name, "w")
                    prf_file.write("{}{}\n".format(
                        configuration.HEADER_WIG, seq_id))
                    prf_q_file.write("{}{}\n".format(
                        configuration.HEADER_WIG, seq_id))
                    for i in indices:
                        prf_file.write("{}\t{}\n".format(individual_dict[
                            'idx'][i], int(individual_dict[key][0][i] / CV)))
                        prf_q_file.write("{}\t{}\n".format(individual_dict[
                            'idx'][i], int(individual_dict[key][1][i])))
                    files_dict[key] = [prf_name, [seq_id], prf_qual_name]
                    wig_files.append(prf_file)
                    wig_files.append(prf_q_file)
                    prf_file.close()
                    prf_q_file.close()
                else:
                    prf_name = files_dict[key][0]
                    prf_qual_name = files_dict[key][2]
                    with open(prf_name, "a") as prf_file, open(
                            prf_qual_name, "a") as prf_q_file:
                        if seq_id not in files_dict[key][1]:
                            prf_file.write("{}{}\n".format(
                                configuration.HEADER_WIG, seq_id))
                            prf_q_file.write("{}{}\n".format(
                                configuration.HEADER_WIG, seq_id))
                            files_dict[key][1].append(seq_id)
                        for i in indices:
                            prf_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i], int(
                                    individual_dict[key][0][i] / CV)))
                            prf_q_file.write("{}\t{}\n".format(
                                individual_dict['idx'][i], int(
                                    individual_dict[key][1][i])))
    return files_dict, wig_files


def handle_zero_lines(repeat_subhits):
    ''' Remove positions that contain only zeros, i.e. positions without any
    hit; border zero positions are preserved so that graphs plot correctly '''
    zero_idx = [idx for idx, val in enumerate(repeat_subhits) if val == 0]
    indices = [idx for idx, val in enumerate(repeat_subhits) if val != 0]
    zero_breakpoints = []
    for key, group in groupby(
            enumerate(zero_idx),
            lambda index_item: index_item[0] - index_item[1]):
        group = list(map(itemgetter(1), group))
        zero_breakpoints.append(group[0])
        zero_breakpoints.append(group[-1])
    if indices:
        indices.extend(zero_breakpoints)
        indices = sorted(set(indices), key=int)
    else:
        indices = []
    return indices
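# Worked example: for repeat_subhits = [0, 0, 3, 5, 0, 0, 0, 2, 0] the
# non-zero indices are [2, 3, 7] and the zero runs are (0, 1), (4, 5, 6) and
# (8,), whose first and last members are kept as breakpoints, so the function
# returns [0, 1, 2, 3, 4, 6, 7, 8] -- only the interior zero at index 5 is
# dropped.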

def repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA,
                        xminimal, xmaximal, domains, seq_ids_dom, CN,
                        seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta; create the gff file
    and the profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    # NOTE: the early return below intentionally disables the plotting code
    # that follows
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
            if seq in seq_ids_dom:
                dom_idx = seq_ids_dom.index(seq)
                [fig, ax] = visualization.vis_domains(
                    fig, ax, seq, xminimal[dom_idx], xmaximal[dom_idx],
                    domains[dom_idx])
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        count_seq += 1
    return None


def repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, CN,
                    seq_ids_all, seq_lengths_all, files_dict):
    ''' Process the hits table separately for each fasta; create the gff file
    and the profile picture '''
    if files_dict:
        gff.create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict,
                       seq_ids_all)
    else:
        with open(OUTPUT_GFF, "w") as gff_file:
            gff_file.write("{}\n".format(configuration.HEADER_GFF))

    # TODO remove plotting, keep only basic report
    # NOTE: the early return below intentionally disables the plotting code
    # that follows
    return None
    seqs_all_part = seq_ids_all[0:configuration.MAX_PIC_NUM]
    graphs_dict = {}
    seqs_long = []
    if files_dict:
        [graphs_dict, seqs_long] = visualization.vis_profrep(
            seq_ids_all, files_dict, seq_lengths_all, CN, HTML_DATA,
            seqs_all_part)
    count_seq = 0
    for seq in seqs_all_part:
        if seq in graphs_dict.keys():
            fig = graphs_dict[seq][0]
            ax = graphs_dict[seq][1]
            art = []
            lgd = ax.legend(bbox_to_anchor=(0.5, -0.1), loc=9, ncol=3)
            art.append(lgd)
        elif seq in seqs_long:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.text(
                0.3,
                0.5,
                "Graphs are only displayed if sequence is not longer than {} bp".format(
                    configuration.SEQ_LEN_VIZ),
                transform=ax.transAxes,
                fontsize=14,
                verticalalignment='center',
                color='blue')
        else:
            [fig, ax] = visualization.plot_figure(
                seq, seq_lengths_all[count_seq], CN)
            ax.hlines(0, 0, seq_lengths_all[count_seq], color="red", lw=4)
        output_pic_png = "{}/{}.png".format(HTML_DATA, count_seq)
        fig.savefig(output_pic_png,
                    bbox_inches="tight",
                    format="png",
                    dpi=configuration.IMAGE_RES)
        plt.close()
        count_seq += 1
    return None

def html_output(total_length, seq_lengths_all, seq_names, HTML, DB_NAME, REF,
                REF_LINK):
    ''' Define the html output with a limited number of output pictures and a
    link to JBrowse '''
    info = "\t\t".join(['<pre> {} [{} bp]</pre>'.format(seq_name, seq_length)
                        for seq_name, seq_length in zip(seq_names,
                                                        seq_lengths_all)])
    if REF:
        ref_part_1 = REF.split("-")[0]
        ref_part_2 = "-".join(REF.split("-")[1:]).split(". ")[0]
        ref_part_3 = ". ".join("-".join(REF.split("-")[1:]).split(". ")[1:])
        ref_string = '''<h6> {} - <a href="{}" target="_blank" >{}</a>. {}'''.format(
            ref_part_1, REF_LINK, ref_part_2, ref_part_3)
    else:
        ref_string = "Custom Data"
    pictures = "\n\t\t".join(['<img src="{}.png" width=1800>'.format(
        pic) for pic in range(len(seq_names))[:configuration.MAX_PIC_NUM]])
    html_str = configuration.HTML_STR.format(info, total_length, DB_NAME,
                                             pictures, ref_string)
    with open(HTML, "w") as html_file:
        html_file.write(html_str)


def adjust_tracklist(jbrowse_data_path):
    starting_lines = []
    ending_lines = []
    end = False
    with open(
            os.path.join(jbrowse_data_path,
                         "trackList.json"), "r") as track_list:
        for line in track_list:
            if "]" not in line and not end:
                starting_lines.append(line)
            else:
                end = True
                ending_lines.append(line)
    with open(
            os.path.join(jbrowse_data_path,
                         "trackList.json"), "w") as track_list:
        for line in starting_lines:
            track_list.write(line)
    return ending_lines


def jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF,
                     total_length, JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert the output data to be displayed as
    tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(
            JBROWSE_BIN), "--gff", OUT_DOMAIN_GFF, "--trackLabel",
                         "GFF_domains", "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(
            JBROWSE_BIN), "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(
            JBROWSE_BIN), "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        # Control the total length processed; if above the threshold, don't
        # create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][
                    0], os.path.join(HTML_DATA,
                                     configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None
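# Fallback track colors, as used above: once configuration.COLORS_HEX is
# exhausted, each additional repeat class gets a random hex color:
#
#   r = lambda: random.randint(0, 255)
#   color = '#%02X%02X%02X' % (r(), r(), r())  # e.g. '#1FA07C'
#
# Each wig profile is then converted to BigWig (wigToBigWig takes the wig
# file, the chrom sizes file and the output .bw path) so JBrowse can render
# it as a quantitative track.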

def jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length,
                 JBROWSE_BIN, files_dict):
    ''' Set up the paths, link and convert the output data to be displayed as
    tracks in JBrowse '''
    jbrowse_data_path = os.path.join(HTML_DATA, configuration.jbrowse_data_dir)
    with tempfile.TemporaryDirectory() as dirpath:
        subprocess.call(["{}/prepare-refseqs.pl".format(JBROWSE_BIN),
                         "--fasta", QUERY, "--out", jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(
            JBROWSE_BIN), "--gff", OUTPUT_GFF, "--trackLabel", "GFF_repeats",
                         "--config", configuration.JSON_CONF_R, "--out",
                         jbrowse_data_path])
        subprocess.call(["{}/flatfile-to-json.pl".format(
            JBROWSE_BIN), "--gff", N_GFF, "--trackLabel", "N_regions",
                         "--config", configuration.JSON_CONF_N, "--out",
                         jbrowse_data_path])
        count = 0
        ## Control the total length processed; if above the threshold, don't
        ## create wig image tracks
        if files_dict:
            exclude = set(['ALL'])
            sorted_keys = sorted(set(files_dict.keys()).difference(exclude))
            sorted_keys.insert(0, "ALL")
            ending_lines = adjust_tracklist(jbrowse_data_path)
            track_list = open(
                os.path.join(jbrowse_data_path, "trackList.json"), "a")
            color_avail = len(configuration.COLORS_HEX)
            for repeat_id in sorted_keys:
                if count <= color_avail - 1:
                    color = configuration.COLORS_HEX[count]
                else:
                    r = lambda: random.randint(0, 255)
                    color = '#%02X%02X%02X' % (r(), r(), r())
                count += 1
                bw_name = "{}.bw".format(re.sub(r'[\/\|]', '_', repeat_id))
                subprocess.call(["wigToBigWig", files_dict[repeat_id][
                    0], os.path.join(HTML_DATA,
                                     configuration.CHROM_SIZES_FILE),
                                 os.path.join(jbrowse_data_path, bw_name)])
                track_list.write(configuration.TRACK_LIST.format(
                    "{", bw_name, repeat_id, repeat_id, "{", color, "}", "}"))
            for line in ending_lines:
                track_list.write(line)
            track_list.close()
        distutils.dir_util.copy_tree(dirpath, jbrowse_data_path)
    return None


def genome2coverage(GS, BLAST_DB):
    ''' Convert genome size to coverage '''
    num_of_reads = 0
    with open(BLAST_DB, "r") as reads_all:
        first_line = reads_all.readline()
        if first_line.startswith(">"):
            num_of_reads += 1
        first_seq = reads_all.readline().rstrip()
        for line in reads_all:
            if line.startswith(">"):
                num_of_reads += 1
    len_of_read = len(first_seq)
    CV = (num_of_reads * len_of_read) / (GS * 1000000)  # GS in Mb
    return CV
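# Worked example of the coverage formula (hypothetical numbers): with
# 2,500,000 reads of 100 bp each and a genome size of 500 Mb,
#
#   CV = (2500000 * 100) / (500 * 1000000) = 0.5
#
# i.e. the read set covers the genome 0.5x; hit counts divided by CV then give
# copy-number estimates. Note that the read length is taken from the first
# read only, so reads are assumed to be of equal length.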

def prepared_data(DB_ID):
    ''' Get prepared repeat annotation data from the table based on the
    selected species ID '''
    tbl = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        configuration.PROFREP_DATA, configuration.PROFREP_TBL)
    with open(tbl, "r") as datasets:
        for line in datasets:
            if line.split("\t")[0] == DB_ID:
                DB_NAME = line.split("\t")[1]
                CV = float(line.split("\t")[5])
                REF = line.split("\t")[6]
                REF_LINK = line.split("\t")[7]
    return DB_NAME, CV, REF, REF_LINK


def seq_sizes_file(seq_ids, seq_lengths_all, HTML_DATA):
    chrom_sizes = os.path.join(HTML_DATA, configuration.CHROM_SIZES_FILE)
    with open(chrom_sizes, "w") as chroms:
        for seq_id, seq_length in zip(seq_ids, seq_lengths_all):
            chroms.write("{}\t{}\n".format(seq_id, seq_length))
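# The chrom sizes file is the two-column, tab-separated layout expected by
# wigToBigWig (hypothetical values):
#
#   seq1	1500000
#   seq2	830000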

def main(args):
    ## Command line arguments
    QUERY = args.query
    BLAST_DB = args.reads
    CL_ANNOTATION_TBL = args.ann_tbl
    CLS = args.cls
    BITSCORE = args.bit_score
    E_VALUE = args.e_value
    WORD_SIZE = args.word_size
    WINDOW = args.window
    OVERLAP = args.overlap
    BLAST_TASK = args.task
    MAX_ALIGNMENTS = args.max_alignments
    NEW_DB = args.new_db
    THRESHOLD = args.threshold_repeat
    THRESHOLD_SEGMENT = args.threshold_segment
    TH_IDENTITY = args.th_identity
    TH_LENGTH = args.th_length
    TH_INTERRUPT = args.interruptions
    TH_SIMILARITY = args.th_similarity
    TH_LEN_RATIO = args.max_len_proportion
    OUTPUT_GFF = args.output_gff
    DOMAINS = args.protein_domains
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUT_DOMAIN_GFF = args.domain_gff
    HTML = args.html_file
    HTML_DATA = args.html_path
    N_GFF = args.n_gff
    CN = args.copy_numbers
    GS = args.genome_size
    DB_ID = args.db_id
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    JBROWSE_BIN = args.jbrowse_bin
    DUST_FILTER = args.dust_filter
    LOG_FILE = args.log_file
    #JBROWSE_BIN = os.environ['JBROWSE_SOURCE_DIR'] + "/bin"
    #if not JBROWSE_BIN:
    #    try:
    #        JBROWSE_BIN = os.environ['JBROWSE_BIN']
    #    except KeyError:
    #        raise ValueError('There was no path to the JBrowse bin found - set the environment variable JBROWSE_BIN or pass the argument explicitly')
    if CN and not DB_ID and not GS:
        raise ValueError(
            "Genome size missing - if you want to convert hits to copy numbers, please enter the --genome_size parameter")

    ## Check if there are forbidden characters in the fasta IDs
    [forbidden_ids, headers] = check_fasta_id(QUERY)
    if forbidden_ids:
        ##################### USER ERROR ###############################
        raise UserWarning(
            "The following IDs contain forbidden characters ('/' or '\\') - PLEASE REPLACE OR DELETE THEM:\n{}".format(
                "\n".join(forbidden_ids)))

    if len(headers) > len(set([header.split(" ")[0] for header in headers])):
        raise NameError(
            '''Sequences in multifasta format are not named correctly:
            seq IDs (before the first space) are the same''')

    ## Create a new blast database of reads
    if NEW_DB:
        subprocess.call("makeblastdb -in {} -dbtype nucl".format(BLAST_DB),
                        shell=True)

    ## Parse the prepared annotation data table
    if DB_ID:
        [DB_NAME, CV, REF, REF_LINK] = prepared_data(DB_ID)
    else:
        REF = None
        REF_LINK = None
        DB_NAME = "CUSTOM"

    ## Create a dir to store the outputs for html and JBrowse
    if not os.path.exists(HTML_DATA):
        os.makedirs(HTML_DATA)

    if not os.path.isabs(HTML):
        HTML = os.path.join(HTML_DATA, HTML)

    if not os.path.isabs(OUT_DOMAIN_GFF):
        OUT_DOMAIN_GFF = os.path.join(HTML_DATA, OUT_DOMAIN_GFF)

    if not os.path.isabs(LOG_FILE):
        LOG_FILE = os.path.join(HTML_DATA, LOG_FILE)

    if not os.path.isabs(N_GFF):
        N_GFF = os.path.join(HTML_DATA, N_GFF)

    if not os.path.isabs(OUTPUT_GFF):
        OUTPUT_GFF = os.path.join(HTML_DATA, OUTPUT_GFF)

    path = os.path.dirname(os.path.realpath(__file__))
    version_string = get_version(path)

    log = os.open(LOG_FILE, os.O_RDWR | os.O_CREAT)
    os.write(log, version_string.encode("utf-8"))

    ## Define parameters for the parallel processes
    STEP = WINDOW - OVERLAP
    NUM_CORES = multiprocessing.cpu_count()
    os.write(log, "NUM_OF_CORES = {}\n".format(NUM_CORES).encode("utf-8"))

    ## Convert genome size to coverage
    if CN and GS:
        CV = genome2coverage(GS, BLAST_DB)
        os.write(log, "COVERAGE = {}\n".format(CV).encode("utf-8"))

    parallel_pool = Pool(NUM_CORES)

    ## Assign clusters to repetitive classes
    [cl_annotations_items, annotation_keys
     ] = cluster_annotation(CL_ANNOTATION_TBL)

    ## Assign reads to repetitive classes
    reads_annotations = read_annotation(CLS, cl_annotations_items)

    ## Detect all fasta sequences from the input
    fasta_list = multifasta(QUERY)
    headers = []
    files_dict = {}
    wig_files = []
    seq_count = 1
    start = 1
    total_length = 0
    seq_lengths_all = []
    Ngff = open(N_GFF, "w")
    Ngff.write("{}\n".format(configuration.HEADER_GFF))

    ## Find hits for each fasta sequence separately
    t_blast = time.time()
    for subfasta in fasta_list:
        [header, sequence] = fasta_read(subfasta)
        os.write(log, "Sequence {} is being processed...\n".format(
            header).encode("utf-8"))
        os.fsync(log)
        indices_N = [indices + 1
                     for indices, n in enumerate(sequence)
                     if n == "n" or n == "N"]
        if indices_N:
            gff.idx_ranges_N(indices_N, configuration.N_segment, header, Ngff,
                             configuration.N_NAME, configuration.N_FEATURE)
        seq_length = len(sequence)
        headers.append(header)
        ## Create the parallel processes
        subset_index = list(range(0, seq_length, STEP))
        ## Handle the case when the penultimate window is incomplete but is
        ## followed by another one
        if len(subset_index) > 1 and subset_index[-2] + WINDOW >= seq_length:
            subset_index = subset_index[:-1]
        last_index = subset_index[-1]
        index_range = range(len(subset_index))
        for chunk_index in index_range[0::configuration.MAX_FILES_SUBPROFILES]:
            multiple_param = partial(
                parallel_process, WINDOW, OVERLAP, seq_length, annotation_keys,
                reads_annotations, subfasta, BLAST_DB, E_VALUE, WORD_SIZE,
                BLAST_TASK, MAX_ALIGNMENTS, BITSCORE, DUST_FILTER, last_index,
                len(subset_index))
            subprofiles_all = parallel_pool.map(multiple_param, subset_index[
                chunk_index:chunk_index + configuration.MAX_FILES_SUBPROFILES])
            ## Join the partial profiles into the final profile of the sequence
            if CN:
                [files_dict, wig_files
                 ] = concatenate_prof_CN(CV, subprofiles_all, files_dict,
                                         header, HTML_DATA, wig_files)
            else:
                [files_dict, wig_files] = concatenate_prof(
                    subprofiles_all, files_dict, header, HTML_DATA, wig_files)
            for subprofile in subprofiles_all:
                os.unlink(subprofile)
        total_length += seq_length
        seq_lengths_all.append(seq_length)

    os.write(log, "ELAPSED_TIME_BLAST = {} s\n".format(time.time(
    ) - t_blast).encode("utf-8"))
    os.write(
        log,
        "TOTAL_LENGTH_ANALYZED = {} bp\n".format(total_length).encode("utf-8"))

    ## Close the opened files
    for opened_file in wig_files:
        opened_file.close()
    Ngff.close()

    ## Create the file containing sequence sizes to convert wig to bigwig
    seq_sizes_file(headers, seq_lengths_all, HTML_DATA)
"Domains module has started...\n".encode("utf-8")) + os.fsync(log) + domains_primary = NamedTemporaryFile(delete=False) + protein_domains.domain_search(QUERY, LAST_DB, CLASSIFICATION, + domains_primary.name, THRESHOLD_SCORE, + WIN_DOM, OVERLAP_DOM) + domains_primary.close() + [xminimal, xmaximal, domains, seq_ids_dom + ] = domains_filtering.filter_qual_dom( + domains_primary.name, OUT_DOMAIN_GFF, TH_IDENTITY, TH_SIMILARITY, + TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, 'All', "") + os.unlink(domains_primary.name) + os.write(log, "ELAPSED_TIME_DOMAINS = {} s\n".format(time.time( + ) - t_domains).encode("utf-8")) + + # Process individual sequences from the input file sequentially + t_gff_vis = time.time() + repeats_process_dom(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, + HTML_DATA, xminimal, xmaximal, domains, + seq_ids_dom, CN, headers, seq_lengths_all, + files_dict) + os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(time.time( + ) - t_gff_vis).encode("utf-8")) + os.fsync(log) + # Prepare data for html output + t_jbrowse = time.time() + os.write(log, "JBrowse tracks are being prepared...\n".encode("utf-8")) + os.fsync(log) + jbrowse_prep_dom(HTML_DATA, QUERY, OUT_DOMAIN_GFF, OUTPUT_GFF, N_GFF, + total_length, JBROWSE_BIN, files_dict) + os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(time.time( + ) - t_jbrowse).encode("utf-8")) + else: + # Process individual sequences from the input file sequentially + t_gff_vis = time.time() + repeats_process(OUTPUT_GFF, THRESHOLD, THRESHOLD_SEGMENT, HTML_DATA, + CN, headers, seq_lengths_all, files_dict) + os.write(log, "ELAPSED_TIME_GFF_VIS = {} s\n".format(time.time( + ) - t_gff_vis).encode("utf-8")) + + # Prepare data for html output + t_jbrowse = time.time() + jbrowse_prep(HTML_DATA, QUERY, OUTPUT_GFF, N_GFF, total_length, + JBROWSE_BIN, files_dict) + os.write(log, "ELAPSED_TIME_JBROWSE_PREP = {} s\n".format(time.time( + ) - t_jbrowse).encode("utf-8")) + + # Create HTML output + t_html = time.time() + os.write( + log, + "HTML output and JBrowse data structure are being prepared...\n".encode( + "utf-8")) + os.fsync(log) + html_output(total_length, seq_lengths_all, headers, HTML, DB_NAME, REF, + REF_LINK) + os.write(log, "ELAPSED_TIME_HTML = {} s\n".format(time.time() - + t_html).encode("utf-8")) + os.write(log, "ELAPSED_TIME_PROFREP = {} s\n".format(time.time( + ) - t_profrep).encode("utf-8")) + os.close(log) + + ## Clean up the temporary fasta files + for subfasta in fasta_list: + os.unlink(subfasta) + + +if __name__ == "__main__": + import argparse + from argparse import RawDescriptionHelpFormatter + + class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + pass + + # Default paths (command line usage) + HTML = configuration.HTML + DOMAINS_GFF = configuration.DOMAINS_GFF + REPEATS_GFF = configuration.REPEATS_GFF + N_GFF = configuration.N_GFF + LOG_FILE = configuration.LOG_FILE + PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR + + # Command line arguments + parser = argparse.ArgumentParser( + description=''' + + DEPENDENCIES: + - python 3.4 or higher with packages: + - numpy + - matplotlib + - biopython + - BLAST 2.2.28+ or higher + - LAST 744 or higher + - wigToBigWig + - cd-hit + - JBrowse - ! 

if __name__ == "__main__":

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    # Default paths (command line usage)
    HTML = configuration.HTML
    DOMAINS_GFF = configuration.DOMAINS_GFF
    REPEATS_GFF = configuration.REPEATS_GFF
    N_GFF = configuration.N_GFF
    LOG_FILE = configuration.LOG_FILE
    PROFREP_OUTPUT_DIR = configuration.PROFREP_OUTPUT_DIR

    # Command line arguments
    parser = argparse.ArgumentParser(
        description='''

    DEPENDENCIES:
        - python 3.4 or higher with packages:
            - numpy
            - matplotlib
            - biopython
        - BLAST 2.2.28+ or higher
        - LAST 744 or higher
        - wigToBigWig
        - cd-hit
        - JBrowse - ! only the bin is needed; it does not have to be installed under a web server
        * ProfRep Modules:
            - gff.py
            - visualization.py
            - configuration.py
            - protein_domains.py
            - domains_filtering.py

    EXAMPLE OF USAGE:

        ./profrep.py --query PATH_TO_DNA_SEQ --reads PATH_TO_READS --ann_tbl PATH_TO_CLUSTERS_CLASSIFICATION --cls PATH_TO_hitsort.cls [--new_db True]
        ''',
        epilog="""take a look at README for more detailed information""",
        formatter_class=CustomFormatter)

    Required = parser.add_argument_group('required arguments')
    altRequired = parser.add_argument_group(
        'alternative required arguments - prepared datasets')
    blastOpt = parser.add_argument_group('optional arguments - BLAST Search')
    parallelOpt = parser.add_argument_group(
        'optional arguments - Parallel Processing')
    protOpt = parser.add_argument_group('optional arguments - Protein Domains')
    outOpt = parser.add_argument_group('optional arguments - Output Paths')
    cnOpt = parser.add_argument_group(
        'optional arguments - Copy Numbers/Hits')
    galaxyOpt = parser.add_argument_group(
        'optional arguments - Environment Variables')

    ################ INPUTS ############################################
    Required.add_argument('-q',
                          '--query',
                          type=str,
                          required=True,
                          help='input DNA sequence in (multi)fasta format')
    Required.add_argument('-rdb',
                          '--reads',
                          type=str,
                          required=True,
                          help='blast database of all sequencing reads')
    Required.add_argument(
        '-a',
        '--ann_tbl',
        type=str,
        required=True,
        help=
        'clusters annotation table: tab-separated cluster number and its classification')
    Required.add_argument(
        '-c',
        '--cls',
        type=str,
        required=True,
        help='cls file containing reads assigned to clusters (hitsort.cls)')
    altRequired.add_argument(
        '-id',
        '--db_id',
        type=str,
        help='annotation dataset ID (first column of the datasets table)')

    ################ BLAST parameters ##################################
    blastOpt.add_argument('-bs',
                          '--bit_score',
                          type=float,
                          default=50,
                          help='bitscore threshold')
    blastOpt.add_argument(
        '-m',
        '--max_alignments',
        type=int,
        default=10000000,
        help=
        'blast filtering option: maximal number of alignments in the output')
    blastOpt.add_argument('-e',
                          '--e_value',
                          type=str,
                          default=0.1,
                          help='blast setting option: e-value')
    blastOpt.add_argument(
        '-df',
        '--dust_filter',
        type=str,
        default="'20 64 1'",
        help='dust filters low-complexity regions during the BLAST search')
    blastOpt.add_argument(
        '-ws',
        '--word_size',
        type=int,
        default=11,
        help='blast search option: initial word size for the alignment')
    blastOpt.add_argument('-t',
                          '--task',
                          type=str,
                          default="blastn",
                          help='type of blast to be triggered')
    blastOpt.add_argument(
        '-n',
        '--new_db',
        type=str2bool,
        default=True,
        help=
        'create a new blast database; USE THIS OPTION IF YOU RUN PROFREP WITH A NEW DATABASE FOR THE FIRST TIME')

    ############### PARALLEL PROCESSING ARGUMENTS ######################
    parallelOpt.add_argument(
        '-w',
        '--window',
        type=int,
        default=5000,
        help='sliding window size for parallel processing')
    parallelOpt.add_argument(
        '-o',
        '--overlap',
        type=int,
        default=150,
        help=
        'overlap for regions processed in parallel; set it greater than the read size')
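    # Note on the float-range options below: choices=[Range(0.0, 1.0)] works
    # because argparse tests membership with '==', which dispatches to
    # Range.__eq__ defined at the top of this file, so any float between 0.0
    # and 1.0 is accepted, e.g.:
    #
    #   0.35 == Range(0.0, 1.0)   # True  -> value accepted
    #   1.20 == Range(0.0, 1.0)   # False -> argparse error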
    ################ PROTEIN DOMAINS PARAMETERS ########################
    protOpt.add_argument('-pd',
                         '--protein_domains',
                         type=str2bool,
                         default=False,
                         help='use the module for protein domains')
    protOpt.add_argument('-pdb',
                         '--protein_database',
                         type=str,
                         help='protein domains database')
    protOpt.add_argument('-cs',
                         '--classification',
                         type=str,
                         help='protein domains classification file')
    protOpt.add_argument(
        '-wd',
        '--win_dom',
        type=int,
        default=10000000,
        help=
        'protein domains module: sliding window to process large input sequences sequentially')
    protOpt.add_argument(
        '-od',
        '--overlap_dom',
        type=int,
        default=10000,
        help=
        'protein domains module: overlap of sequences in two consecutive windows')
    protOpt.add_argument(
        '-thsc',
        '--threshold_score',
        type=int,
        default=80,
        help=
        'protein domains module: percentage of the best score within the cluster to report significant domains')
    protOpt.add_argument("-thl",
                         "--th_length",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.8,
                         help="proportion of alignment length threshold")
    protOpt.add_argument("-thi",
                         "--th_identity",
                         type=float,
                         choices=[Range(0.0, 1.0)],
                         default=0.35,
                         help="proportion of alignment identity threshold")
    protOpt.add_argument(
        "-ths",
        "--th_similarity",
        type=float,
        choices=[Range(0.0, 1.0)],
        default=0.45,
        help="threshold for proportional alignment similarity")
    protOpt.add_argument(
        "-ir",
        "--interruptions",
        type=int,
        default=3,
        help=
        "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
    protOpt.add_argument(
        "-mlen",
        "--max_len_proportion",
        type=float,
        default=1.2,
        help=
        "maximal proportion of the alignment length to the original length of the protein domain from the database")

    ################ OUTPUTS ###########################################
    outOpt.add_argument('-lg',
                        '--log_file',
                        type=str,
                        default=LOG_FILE,
                        help='path to the log file')
    outOpt.add_argument('-ouf',
                        '--output_gff',
                        type=str,
                        default=REPEATS_GFF,
                        help='path to the output gff of repetitive regions')
    outOpt.add_argument('-oug',
                        '--domain_gff',
                        type=str,
                        default=DOMAINS_GFF,
                        help='path to the output gff of protein domains')
    outOpt.add_argument('-oun',
                        '--n_gff',
                        type=str,
                        default=N_GFF,
                        help='path to the output gff of N regions')
    outOpt.add_argument('-hf',
                        '--html_file',
                        type=str,
                        default=HTML,
                        help='path to the output html file')
    outOpt.add_argument('-hp',
                        '--html_path',
                        type=str,
                        default=PROFREP_OUTPUT_DIR,
                        help='path to html extra files')

    ################ HITS/COPY NUMBERS #################################
    cnOpt.add_argument('-cn',
                       '--copy_numbers',
                       type=str2bool,
                       default=False,
                       help='convert hits to copy numbers')
    cnOpt.add_argument(
        '-gs',
        '--genome_size',
        type=float,
        help=
        'genome size is required when converting hits to copy numbers with custom data')
    cnOpt.add_argument(
        '-thr',
        '--threshold_repeat',
        type=int,
        default=3,
        help=
        'threshold for hits/copy numbers per position to be considered repetitive')
    cnOpt.add_argument(
        '-thsg',
        '--threshold_segment',
        type=int,
        default=80,
        help='threshold for the length of a repetitive segment to be reported')

    ################ JBrowse ###########################################
    galaxyOpt.add_argument('-jb',
                           '--jbrowse_bin',
                           type=str,
                           help='path to the JBrowse bin directory')

    args = parser.parse_args()
    main(args)