Repository 'dante'
hg clone https://toolshed.g2.bx.psu.edu/repos/petr-novak/dante

Changeset 15:3151a72a6671 (2019-09-03)
Previous changeset 14:a6c55d1bdb6c (2019-08-28) Next changeset 16:0e820310d4dc (2019-09-04)
Commit message:
Uploaded
modified:
dante.py
dante.xml
dante_gff_output_filtering.py
parse_aln.py
added:
coverage2gff.py
test-data/GEPY_test_long_1_output_unfiltered.gff3
removed:
dante_pyan_scheme.png
dante_pyan_scheme.svg
b
diff -r a6c55d1bdb6c -r 3151a72a6671 coverage2gff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/coverage2gff.py Tue Sep 03 05:20:02 2019 -0400
[
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import argparse
+import tempfile
+import shutil
+import sys
+
+def parse_args():
+    '''Parse command-line arguments.
+
+    Returns argparse.Namespace with two string attributes:
+    gff_file - gff3 file which is annotated with coverage (-g, required)
+    profile  - coverage profile file which is read (-p, required)
+    '''
+    description = """
+    appending average coverage from coverage profile to gff3 records
+    """
+
+    parser = argparse.ArgumentParser(
+        description=description,
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        '-g',
+        '--gff_file',
+        default=None,
+        required=True,
+        help="input gff3 file for appending coverage information",
+        type=str,
+        action='store')
+    parser.add_argument(
+        '-p',
+        '--profile',
+        default=None,
+        required=True,
+        help="input file with coverage profile",
+        type=str,
+        action="store")
+    return parser.parse_args()
+
+def read_coverage(profile):
+    '''Read coverage profile file into dictionary.
+
+    The file is consumed as consecutive pairs of lines: a name line, whose
+    first character is dropped, followed by a line with whitespace
+    separated integers.
+
+    profile - path to the profile file
+    Returns dictionary name -> list of int
+    '''
+    d = {}
+    with open(profile) as p:
+        # zip(p, p) takes two lines from the same iterator in each step
+        for name, prof in zip(p, p):
+            d[name[1:].strip()] = [int(i) for i in prof.split()]
+    # nothing is written to stderr - the tool wrapper treats any "error"
+    # found in stderr as fatal and sequence names are arbitrary text
+    return d
+
+
+def main():
+    '''Append average coverage to every feature line of the gff3 file.
+
+    For each feature the mean of the profile values between start and end
+    is rounded to three decimals and appended to the line as
+    ";Coverage=<value>". The gff3 file given on the command line is
+    rewritten in place.
+
+    Raises KeyError when the sequence id of a feature is missing in the
+    profile.
+    '''
+    args = parse_args()
+    coverage_hash = read_coverage(args.profile)
+    # the gff file is both input and output, so the result is collected in
+    # a temporary file first; the context manager closes and removes it
+    with tempfile.NamedTemporaryFile(mode='w') as gff_tmp:
+        with open(args.gff_file) as f:
+            for line in f:
+                # comments, directives and blank lines are copied unchanged
+                if line[0] == "#" or not line.strip():
+                    gff_tmp.write(line)
+                    continue
+                # gff3 columns are tab separated
+                line_parts = line.split("\t")
+                start = int(line_parts[3])
+                end = int(line_parts[4])
+                # start and end are used as 1-based inclusive positions
+                coverage = round(
+                    sum(coverage_hash[line_parts[0]][(start - 1):end]) /
+                    (end - start + 1), 3)
+                gff_tmp.write(
+                    "{};Coverage={}\n".format(line.strip(), coverage))
+        # make sure everything is on disk before the file is copied by name
+        gff_tmp.flush()
+        shutil.copyfile(gff_tmp.name, args.gff_file)
+
+
+# run the conversion only when executed as a script, not on import
+if __name__ == "__main__":
+
+    main()
b
diff -r a6c55d1bdb6c -r 3151a72a6671 dante.py
--- a/dante.py Wed Aug 28 08:08:47 2019 -0400
+++ b/dante.py Tue Sep 03 05:20:02 2019 -0400
[
@@ -586,10 +586,10 @@
             if count_region == len(indices_plus):
                 strand_gff = "-"
             if strand_gff == "+":
-                feature_start = min(start_hit[regions_above_threshold])-1
+                feature_start = min(start_hit[regions_above_threshold]) + 1
                 feature_end = max(end_hit[regions_above_threshold])
             else:
-                feature_end = seq_len[region][0] - min(start_hit[regions_above_threshold]) - 1
+                feature_end = seq_len[region][0] - min(start_hit[regions_above_threshold])
                 feature_start = seq_len[region][0] - max(end_hit[regions_above_threshold]) + 1
             create_gff3(domain_type, ann_substring, unique_annotations,
                         ann_pos_counts, feature_start,feature_end,
b
diff -r a6c55d1bdb6c -r 3151a72a6671 dante.xml
--- a/dante.xml Wed Aug 28 08:08:47 2019 -0400
+++ b/dante.xml Tue Sep 03 05:20:02 2019 -0400
b
b'@@ -6,123 +6,185 @@\n     <requirement type="package" version="1.0">rexdb</requirement>\n     <requirement type="set_environment">REXDB</requirement>\n   </requirements>\n-<stdio>\n-  <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />\n-  <regex match="error" source="stderr" level="fatal" description="Unknown error" />\n-</stdio>\n-<command>\n-python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff}\n-\t--protein_database \\${REXDB}/${db_type}_pdb\n-\t--classification \\${REXDB}/${db_type}_class\n-  --scoring_matrix ${scoring_matrix}\n-  &amp;&amp;\n+  <stdio>\n+    <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />\n+    <regex match="error" source="stderr" level="fatal" description="Unknown error" />\n+  </stdio>\n+  <command>\n+    #if str($input_type.input_type_selector) == "aln"\n+      python3 ${__tool_directory__}/parse_aln.py -a $(input_sequences) -f sequences.fasta -p sequences.profile\n+      &amp;&amp;\n+      INPUT_SEQUENCES="sequences.fasta"\n+    #else    \n+      INPUT_SEQUENCES=$(input_sequences)\n+    #end if\n+    &amp;&amp;\n+\n+\n+    python3 ${__tool_directory__}/dante.py --query \\${INPUT_SEQUENCES} --domain_gff ${DomGff}\n+\t  --protein_database \\${REXDB}/${db_type}_pdb\n+\t  --classification \\${REXDB}/${db_type}_class\n+    --scoring_matrix ${scoring_matrix}\n+\n+\n+    #if str($input_type.input_type_selector) == "aln"\n+     &amp;&amp;\n+     python3 ${__tool_directory__}/coverage2gff.py -p sequences.profile -g ${DomGff}\n+    #end if\n \n-python3 ${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff}\n---domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff\n---output_dir .\n---selected_dom All --th_identity 0.35\n---th_similarity 0.45 --th_length 0.9\n---interruptions 1 --max_len_proportion 1.1\n---element_type \'\'  &amp;&amp;\n+    #if str($iterative) == "Yes"\n+    &amp;&amp;\n+    python3 
${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff}\n+    --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff\n+    --output_dir .\n+    --selected_dom All --th_identity 0.35\n+    --th_similarity 0.45 --th_length 0.9\n+    --interruptions 1 --max_len_proportion 1.1\n+    --element_type \'\'\n+    &amp;&amp;\n \n-python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db\n-domains_filtered.class  &amp;&amp;\n+\n+\n+    python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db\n+    domains_filtered.class\n+    &amp;&amp;\n \n-lastdb -p domains_filtered.db domains_filtered.db  &amp;&amp;\n+    lastdb -p domains_filtered.db domains_filtered.db\n+    &amp;&amp;\n+\n+    python3 ${__tool_directory__}/dante.py --query \\${INPUT_SEQUENCES} --domain_gff ${DomGff2}\n+\t  --protein_database domains_filtered.db\n+\t  --classification domains_filtered.class\n+    --scoring_matrix BL80\n+\n \n-python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff2}\n-\t--protein_database domains_filtered.db\n-\t--classification domains_filtered.class\n-  --scoring_matrix BL80\n+    #if str($input_type.input_type_selector) == "aln"\n+     &amp;&amp;\n+     python3 ${__tool_directory__}/coverage2gff.py -p sequences.profile -g ${DomGff2}\n+    #end if\n+    #end if\n \n-</command>\n-<inputs>\n-  <param format="fasta" type="data" name="input"\n-  label="Choose your input sequence" help="Input DNA must be in proper fasta format, multi-fasta containing more sequences is allowed" />\n+  </command>\n+  <inputs>\n \n- <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help="">\n-   <options from_file="rexdb_versions.loc">\n-     <column name="name" index="0"/>\n-     <column name="value" index="1"/>\n-   </options>\n- </param>\n+    <conditional name="input_type">\n+      <param name="input_type_selector" 
type="select" label="Choose the type of sequence data">\n+        <option value="fasta" selected="true">Fasta</option>\n+        <option value='..b'ing assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain\'s final classification is equivalent to this unique classification.\n+\t  2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings\n+\t  3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous\n+\t  \n+    **There are 2 outputs produced by this tool:**\n+\t  \n+    1. GFF3 file of all proteins domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated information related to annotation, repetitive classification, alignment and its quality. 
This file can undergo further filtering using *Protein Domain Filter* tool\n \n-\tName\n+    - Attributes reported always:\n+\n+\t  Name\n \t\ttype of domain; if ambiguous reported with slash \n \t\t\n-\tFinal_classification \n+\t  Final_classification \n \t\tdefinite classification based on all partial classifications of Region_hits_classifications attribute or \n \t\t"Ambiguous_domain" when there is an ambiguous domain type \n \t\t\n-\tRegion_Hits_Classifications\n+\t  Region_Hits_Classifications\n \t\tall hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification\t\t\n-\t\t\t\n-- Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region):\n-\t\n-\tBest_hit  \n+\t\t\n+    - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region):\n+\t  \n+\t  Best_hit  \n \t\tclassification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers\n \t\t\n-\tBest_Hit_DB_Pos\n+\t  Best_Hit_DB_Pos\n \t\tshowing which part of the original datatabase domain corresponding to the Best Hit was aligned on query DNA (e.g. 
**Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents region from 17th to 75th of total 79 aminoacids in the original domain from the database)\n \t\t\n-\tDB_Seq \n+\t  DB_Seq \n \t\tdatabase protein sequence of the best hit mapped to the query DNA\n \t\t\n-\tQuery_Seq \n+\t  Query_Seq \n \t\talignment sequence of the query DNA for the best hit\t\n \t\t\n-\tIdentity\n+\t  Identity\n \t\tratio of identical amino acids in alignment sequence to the length of alignment\n \t\t\n-\tSimilarity\n+\t  Similarity\n \t\tratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment\n \t\t\n-\tRelat_Length\n+\t  Relat_Length\n \t\tratio of gapless length of the aligned protein sequence to the whole length of the database protein \n \t\t\n-\tRelat_Interruptions\n+\t  Relat_Interruptions\n \t\tnumber of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA\n \t\t\n-\tHit_to_DB_Length\n+\t  Hit_to_DB_Length\n \t\tproportion of alignment length to the original length of the protein domain from database\n-\t\n-\t\n+\t  \n+\t  \n \n-!NOTE: Tool can in average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetive elements occurence) as well as computing resources. Maximum running time of the tool is 7 days.\n+    !NOTE: Tool can in average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetive elements occurence) as well as computing resources. Maximum running time of the tool is 7 days.\n \n- </help>\n+  </help>\n </tool>\n \n'
b
diff -r a6c55d1bdb6c -r 3151a72a6671 dante_gff_output_filtering.py
--- a/dante_gff_output_filtering.py Wed Aug 28 08:08:47 2019 -0400
+++ b/dante_gff_output_filtering.py Tue Sep 03 05:20:02 2019 -0400
[
@@ -82,6 +82,22 @@
     return count_comment, lines
 
 
+def parse_gff_line(line):
+    '''Return dictionary with gff fields and attributes.
+
+    line - single tab separated gff3 feature line, trailing newline is
+           ignored
+
+    Keys of the returned dictionary are seqid, source, type, start, end,
+    score, strand, phase and attributes; value of 'attributes' is a
+    dictionary created from semicolon separated key=value pairs.
+    Note - all values are strings, no type conversion is done
+
+    Raises KeyError when the line has less than nine columns.
+    '''
+    # order of first 9 columns is fixed
+    gff_line = dict(
+        zip(
+            ['seqid', 'source', 'type', 'start', 'end',
+             'score', 'strand', 'phase', 'attributes'],
+            line.rstrip("\r\n").split("\t")
+        )
+    )
+    # split attributes on the first "=" only, value can contain "=" too;
+    # empty items (e.g. after trailing ";") are skipped and item without
+    # "=" is stored with empty value
+    gff_line['attributes'] = dict(
+        i.partition("=")[::2]
+        for i in gff_line['attributes'].split(";") if i)
+    return gff_line
+
 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
                     TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
                     ELEMENT):
@@ -90,7 +106,7 @@
     filt_dom_tmp = NamedTemporaryFile(delete=False)
     with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
                                              "w") as gff_filtered:
-        for comment_idx in range(count_comment):
+        for _ in range(count_comment):
             next(gff_all)
         dom_dict = defaultdict(lambda: defaultdict(int))
         orig_class_dict = defaultdict(int)
@@ -109,20 +125,22 @@
             orig_class_dict[classification] += 1
             ## ambiguous domains filtered out automatically
             if classification != configuration.AMBIGUOUS_TAG:
-                al_identity = float(attributes.split(";")[-5].split("=")[1])
-                al_similarity = float(attributes.split(";")[-4].split("=")[1])
-                al_length = float(attributes.split(";")[-3].split("=")[1])
-                relat_interrupt = float(attributes.split(";")[-2].split("=")[
-                    1])
-                db_len_proportion = float(attributes.split(";")[-1].split("=")[
-                    1])
-                dom_type = attributes.split(";")[0].split("=")[1]
-                seq_id = line.split("\t")[0]
-                xminimal = int(line.split("\t")[3])
-                xmaximal = int(line.split("\t")[4])
-                if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and (
-                        dom_type == SELECTED_DOM or
-                        SELECTED_DOM == "All") and (ELEMENT in classification):
+                gff_line = parse_gff_line(line)
+                al_identity = float(gff_line['attributes']['Identity'])
+                al_similarity = float(gff_line['attributes']['Similarity'])
+                al_length = float(gff_line['attributes']['Relat_Length'])
+                relat_interrupt = float(gff_line['attributes']['Relat_Interruptions'])
+                db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length'])
+                dom_type = gff_line['attributes']['Final_Classification']
+                seq_id = gff_line['seqid']
+                xminimal = int(gff_line['start'])
+                xmaximal = int(gff_line['end'])
+                c1 = al_identity >= TH_IDENTITY
+                c2 = al_similarity >= TH_SIMILARITY
+                if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and
+                        db_len_proportion <= TH_LEN_RATIO and
+                        (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and
+                        (ELEMENT in classification)):
                     gff_filtered.writelines(line)
                     filt_class_dict[classification] += 1
                     dom_dict[seq_id][dom_type] += 1
b
diff -r a6c55d1bdb6c -r 3151a72a6671 dante_pyan_scheme.png
b
Binary file dante_pyan_scheme.png has changed
b
diff -r a6c55d1bdb6c -r 3151a72a6671 dante_pyan_scheme.svg
--- a/dante_pyan_scheme.svg Wed Aug 28 08:08:47 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,326 +0,0 @@\n-<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"\n- "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n-<!-- Generated by graphviz version 2.36.0 (20140111.2315)\n- -->\n-<!-- Title: G Pages: 1 -->\n-<svg width="2270pt" height="436pt"\n- viewBox="0.00 0.00 2270.00 436.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">\n-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 432)">\n-<title>G</title>\n-<polygon fill="white" stroke="none" points="-4,4 -4,-432 2266,-432 2266,4 -4,4"/>\n-<g id="clust1" class="cluster"><title>cluster_G</title>\n-<path fill="#808080" fill-opacity="0.094118" stroke="black" d="M20,-8C20,-8 2242,-8 2242,-8 2248,-8 2254,-14 2254,-20 2254,-20 2254,-408 2254,-408 2254,-414 2248,-420 2242,-420 2242,-420 20,-420 20,-420 14,-420 8,-414 8,-408 8,-408 8,-20 8,-20 8,-14 14,-8 20,-8"/>\n-</g>\n-<!-- dante -->\n-<g id="node1" class="node"><title>dante</title>\n-<ellipse fill="#ffffff" fill-opacity="0.698039" stroke="black" cx="1004" cy="-394" rx="29.3479" ry="18"/>\n-<text text-anchor="middle" x="1004" y="-390.3" font-family="Times,serif" font-size="14.00" fill="#000000">dante</text>\n-</g>\n-<!-- dante__CustomFormatter -->\n-<g id="node2" class="node"><title>dante__CustomFormatter</title>\n-<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="89" cy="-322" rx="73.1654" ry="18"/>\n-<text text-anchor="middle" x="89" y="-318.3" font-family="Times,serif" font-size="14.00" fill="#000000">CustomFormatter</text>\n-</g>\n-<!-- dante&#45;&gt;dante__CustomFormatter -->\n-<g id="edge4" class="edge"><title>dante&#45;&gt;dante__CustomFormatter</title>\n-<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M974.471,-392.552C866.016,-390.536 480.929,-380.546 167,-340 160.79,-339.198 154.356,-338.221 147.963,-337.146"/>\n-<polygon fill="#838b8b" stroke="#838b8b" points="148.443,-333.677 137.987,-335.386 
147.227,-340.57 148.443,-333.677"/>\n-</g>\n-<!-- dante&#45;&gt;dante__CustomFormatter -->\n-<g id="edge21" class="edge"><title>dante&#45;&gt;dante__CustomFormatter</title>\n-<path fill="none" stroke="#000000" d="M974.392,-392.41C867.401,-390.09 492.144,-379.67 185,-340 175.598,-338.786 165.683,-337.171 155.921,-335.42"/>\n-<polygon fill="#000000" stroke="#000000" points="156.301,-331.931 145.832,-333.556 155.029,-338.814 156.301,-331.931"/>\n-</g>\n-<!-- dante__adjust_gff -->\n-<g id="node3" class="node"><title>dante__adjust_gff</title>\n-<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="505" cy="-178" rx="46.2191" ry="18"/>\n-<text text-anchor="middle" x="505" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">adjust_gff</text>\n-</g>\n-<!-- dante&#45;&gt;dante__adjust_gff -->\n-<g id="edge7" class="edge"><title>dante&#45;&gt;dante__adjust_gff</title>\n-<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M977.391,-385.915C943.907,-376.703 884.868,-359.47 836,-340 821.049,-334.043 624.595,-237.732 541.603,-196.983"/>\n-<polygon fill="#838b8b" stroke="#838b8b" points="543.049,-193.794 532.53,-192.528 539.964,-200.077 543.049,-193.794"/>\n-</g>\n-<!-- dante__alignment_scoring -->\n-<g id="node4" class="node"><title>dante__alignment_scoring</title>\n-<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="219" cy="-34" rx="76.5394" ry="18"/>\n-<text text-anchor="middle" x="219" y="-30.3" font-family="Times,serif" font-size="14.00" fill="#000000">alignment_scoring</text>\n-</g>\n-<!-- dante&#45;&gt;dante__alignment_scoring -->\n-<g id="edge20" class="edge"><title>dante&#45;&gt;dante__alignment_scoring</title>\n-<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M974.566,-392.156C863.693,-388.738 471.497,-374.426 352,-340 269.671,-316.281 181,-336.678 181,-251 181,-251 181,-251 181,-177 181,-137.196 179.276,-126.039 191,-88 193.85,-78.7512 198.325,-69.2277 202.868,-60.8496"/>\n-<polygon fill="#838b8b" 
stroke="#838b8b" points="206.028,-62.3733 207.942,-51.9529 199.948,-58.9'..b'193.856 1469.26,-192.655"/>\n-<polygon fill="#000000" stroke="#000000" points="1470.08,-196.057 1479.07,-190.439 1468.54,-189.229 1470.08,-196.057"/>\n-</g>\n-<!-- dante__domain_search&#45;&gt;dante__overlapping_regions -->\n-<g id="edge35" class="edge"><title>dante__domain_search&#45;&gt;dante__overlapping_regions</title>\n-<path fill="none" stroke="#000000" d="M1138.58,-244.552C1238.31,-236.976 1441.24,-220.024 1612,-196 1619.66,-194.923 1627.64,-193.666 1635.58,-192.333"/>\n-<polygon fill="#000000" stroke="#000000" points="1636.32,-195.757 1645.58,-190.611 1635.13,-188.859 1636.32,-195.757"/>\n-</g>\n-<!-- dante__domain_search&#45;&gt;dante__score_matrix_evaluation -->\n-<g id="edge27" class="edge"><title>dante__domain_search&#45;&gt;dante__score_matrix_evaluation</title>\n-<path fill="none" stroke="#000000" d="M1139.77,-246.158C1265.04,-240.095 1554.84,-224.099 1797,-196 1806.06,-194.949 1815.52,-193.687 1824.91,-192.331"/>\n-<polygon fill="#000000" stroke="#000000" points="1825.67,-195.757 1835.05,-190.828 1824.64,-188.832 1825.67,-195.757"/>\n-</g>\n-<!-- dante__domain_search&#45;&gt;dante__score_table -->\n-<g id="edge39" class="edge"><title>dante__domain_search&#45;&gt;dante__score_table</title>\n-<path fill="none" stroke="#000000" d="M1013.28,-247.64C899.371,-244.318 653.662,-233.105 450,-196 445.566,-195.192 440.988,-194.198 436.43,-193.099"/>\n-<polygon fill="#000000" stroke="#000000" points="437.103,-189.658 426.546,-190.557 435.359,-196.437 437.103,-189.658"/>\n-</g>\n-<!-- dante__domain_search&#45;&gt;dante__split_fasta -->\n-<g id="edge32" class="edge"><title>dante__domain_search&#45;&gt;dante__split_fasta</title>\n-<path fill="none" stroke="#000000" d="M1140.69,-247.36C1322.78,-242.358 1843.19,-225.792 2011,-196 2015.16,-195.261 2019.45,-194.312 2023.72,-193.243"/>\n-<polygon fill="#000000" stroke="#000000" points="2024.83,-196.569 2033.56,-190.567 2022.99,-189.814 
2024.83,-196.569"/>\n-</g>\n-<!-- dante__domain_search&#45;&gt;dante__write_info -->\n-<g id="edge37" class="edge"><title>dante__domain_search&#45;&gt;dante__write_info</title>\n-<path fill="none" stroke="#000000" d="M1140.79,-247.762C1337.32,-243.633 1931.78,-228.703 2122,-196 2126.24,-195.272 2130.6,-194.331 2134.94,-193.266"/>\n-<polygon fill="#000000" stroke="#000000" points="2136.19,-196.555 2144.95,-190.597 2134.39,-189.791 2136.19,-196.555"/>\n-</g>\n-<!-- dante__filter_params&#45;&gt;dante__alignment_scoring -->\n-<g id="edge25" class="edge"><title>dante__filter_params&#45;&gt;dante__alignment_scoring</title>\n-<path fill="none" stroke="#000000" d="M248.559,-88.055C244.075,-80.0067 238.614,-70.2046 233.612,-61.2259"/>\n-<polygon fill="#000000" stroke="#000000" points="236.549,-59.3075 228.625,-52.2753 230.434,-62.7145 236.549,-59.3075"/>\n-</g>\n-<!-- dante__line_generator&#45;&gt;dante__line_generator -->\n-<g id="edge26" class="edge"><title>dante__line_generator&#45;&gt;dante__line_generator</title>\n-<path fill="none" stroke="#000000" d="M1565.86,-191.203C1585.63,-192.737 1603,-188.336 1603,-178 1603,-169.521 1591.31,-165.036 1576.2,-164.545"/>\n-<polygon fill="#000000" stroke="#000000" points="1575.77,-161.055 1565.86,-164.797 1575.94,-168.053 1575.77,-161.055"/>\n-</g>\n-<!-- dante__main&#45;&gt;dante__domain_search -->\n-<g id="edge41" class="edge"><title>dante__main&#45;&gt;dante__domain_search</title>\n-<path fill="none" stroke="#000000" d="M1065.63,-304.055C1067.3,-296.261 1069.32,-286.822 1071.2,-278.079"/>\n-<polygon fill="#000000" stroke="#000000" points="1074.63,-278.787 1073.3,-268.275 1067.78,-277.32 1074.63,-278.787"/>\n-</g>\n-<!-- dante__score_table&#45;&gt;dante__annotations_dict -->\n-<g id="edge23" class="edge"><title>dante__score_table&#45;&gt;dante__annotations_dict</title>\n-<path fill="none" stroke="#000000" d="M426.204,-165.403C433.984,-163.289 442.21,-161.334 450,-160 727.01,-112.576 1061.92,-106.495 1211.37,-106.404"/>\n-<polygon 
fill="#000000" stroke="#000000" points="1211.38,-109.904 1221.38,-106.407 1211.38,-102.904 1211.38,-109.904"/>\n-</g>\n-</g>\n-</svg>\n'
b
diff -r a6c55d1bdb6c -r 3151a72a6671 parse_aln.py
--- a/parse_aln.py Wed Aug 28 08:08:47 2019 -0400
+++ b/parse_aln.py Tue Sep 03 05:20:02 2019 -0400
[
@@ -4,6 +4,7 @@
 profile file
 '''
 import argparse
+import re
 
 
 def parse_args():
@@ -11,33 +12,126 @@
     description = """
     parsing cap3 assembly aln output
     """
-    parser = argparse.ArgumentParser(description=description,
-                                     formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        '-a', '--aln_file',
-        default=None, required=True,
-        help="Aln file input",
-        type=str,
-        action='store')
-    parser.add_argument(
-        '-f', '--fasta',
-        default=None, required=True,
-        help="fasta output file name",
-        type=str,
-        action='store')
-    parser.add_argument(
-        '-p', '--profile',
-        default=None, required=True,
-        help="output file for coverage profile",
-        type=str,
-        action="store"
-    )
+
+    parser = argparse.ArgumentParser(
+        description=description,
+        formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('-a',
+                        '--aln_file',
+                        default=None,
+                        required=True,
+                        help="Aln file input",
+                        type=str,
+                        action='store')
+    parser.add_argument('-f',
+                        '--fasta',
+                        default=None,
+                        required=True,
+                        help="fasta output file name",
+                        type=str,
+                        action='store')
+    parser.add_argument('-p',
+                        '--profile',
+                        default=None,
+                        required=True,
+                        help="output file for coverage profile",
+                        type=str,
+                        action="store")
     return parser.parse_args()
 
 
+def get_header(f):
+    '''Advance f to the next contig which is followed by alignment ruler.
+
+    Lines are read until a line starting with asterisks is immediately
+    followed by a line containing the ruler.
+
+    Returns tuple (contig_name, aln_start) where contig_name is the second
+    and third whitespace separated word of the asterisk line joined
+    together and aln_start is the position of the ruler in its line.
+    Returns (None, None) when end of file is reached.
+    '''
+    ruler = ".    :    .    :    .    :    .    :    .    :    .    :"
+    stars = "******************"
+    # readline() is used on purpose, it keeps f.tell() usable afterwards
+    for header in iter(f.readline, ""):
+        if header[:18] != stars:
+            continue
+        following = f.readline()
+        offset = following.find(ruler)
+        if offset != -1:
+            words = header.split()
+            return words[1] + words[2], offset
+    return None, None
+
+
+def segment_start(f):
+    '''Tell whether another alignment segment follows in f.
+
+    The next line is only inspected, position in f is restored afterwards.
+    Returns False when the next line contains asterisks (next contig) or
+    when end of file is reached, True otherwise.
+    '''
+    mark = f.tell()
+    upcoming = f.readline()
+    f.seek(mark)
+    # detect next contig or end of file
+    return not ("********" in upcoming or upcoming == "")
+
+
+def get_segment(f, seq_start):
+    '''Read one alignment segment from f.
+
+    f         - opened aln file positioned at the beginning of a segment
+    seq_start - number of leading characters dropped from every line
+
+    Ruler lines are skipped, all lines up to the underscore separator are
+    collected as alignment rows, the line after the separator is the
+    consensus and one more line (empty) is consumed.
+
+    Returns tuple (aln, consensus) - list of rows and consensus string,
+    or (None, None) when no segment follows.
+    Raises ValueError when the file ends inside a segment.
+    '''
+    if not segment_start(f):
+        return None, None
+    aln = []
+    while True:
+        line = f.readline()
+        if not line:
+            # readline() returns "" forever at end of file, without this
+            # check the loop would never terminate
+            if any(row.strip() for row in aln):
+                raise ValueError(
+                    "unexpected end of aln file inside alignment segment")
+            # only blank lines were left at the end of the file
+            return None, None
+        if ".    :    .    :" in line:
+            continue
+        if "__________" in line:
+            consensus = f.readline().rstrip('\n')[seq_start:]
+            f.readline()  # empty line
+            break
+        aln.append(line.rstrip('\n')[seq_start:])
+    return aln, consensus
+
+
+def aln2coverage(aln):
+    '''Count non-gap characters in each column of alignment rows.
+
+    aln - list of strings, rows can differ in length
+
+    Returns list of int as long as the longest row; space and "-" are not
+    counted. Empty aln gives empty list.
+    '''
+    # rows have trailing whitespace removed, so the first row is not
+    # necessarily the longest one - size the result by the longest row
+    coverage = [0] * max((len(a) for a in aln), default=0)
+    for a in aln:
+        for i, c in enumerate(a):
+            if c not in " -":
+                coverage[i] += 1
+    return coverage
+
+
+def read_contig(f, seq_start):
+    '''Read all alignment segments of one contig.
+
+    Segments are taken with get_segment() until it returns no alignment
+    rows; consensus strings are concatenated and coverage of the segments
+    is joined into one list.
+
+    Returns tuple (contig, coverage) - string and list of int.
+    '''
+    consensus_parts = []
+    coverage = []
+    aln, consensus = get_segment(f, seq_start)
+    while aln:
+        consensus_parts.append(consensus)
+        coverage.extend(aln2coverage(aln))
+        aln, consensus = get_segment(f, seq_start)
+    return "".join(consensus_parts), coverage
+
+def remove_gaps(consensus, coverage):
+    '''Drop gap positions from consensus and from its coverage.
+
+    consensus - string, "-" marks a gap
+    coverage  - list with one value per consensus position
+
+    Returns tuple (consensus, coverage); the input objects are returned
+    as they are when consensus has no gap.
+    '''
+    if "-" in consensus:
+        ungapped_coverage = []
+        for base, depth in zip(consensus, coverage):
+            if base != "-":
+                ungapped_coverage.append(depth)
+        return "".join(consensus.split("-")), ungapped_coverage
+    return consensus, coverage
+
+def main():
+    '''Convert aln file to fasta file and coverage profile file.
+
+    For every contig found by get_header() two lines are written to the
+    fasta file (">name" and gap-free consensus) and two lines to the
+    profile file (">name" and space separated coverage values).
+    '''
+    args = parse_args()
+    with open(args.aln_file, 'r') as aln_in, \
+            open(args.fasta, 'w') as fasta_out, \
+            open(args.profile, 'w') as profile_out:
+        contig_name, seq_start = get_header(aln_in)
+        while contig_name:
+            contig, depth = read_contig(aln_in, seq_start)
+            consensus, coverage = remove_gaps(contig, depth)
+            fasta_out.writelines([">{}\n".format(contig_name),
+                                  "{}\n".format(consensus)])
+            profile_line = " ".join(map(str, coverage))
+            profile_out.writelines([">{}\n".format(contig_name),
+                                    "{}\n".format(profile_line)])
+            contig_name, seq_start = get_header(aln_in)
+
+
 if __name__ == "__main__":
 
-    args = parse_args()
-    print(args.profile)
-
-
+    # run the conversion only when executed as a script, not on import
+    main()
b
diff -r a6c55d1bdb6c -r 3151a72a6671 test-data/GEPY_test_long_1_output_unfiltered.gff3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/GEPY_test_long_1_output_unfiltered.gff3 Tue Sep 03 05:20:02 2019 -0400
[
b'@@ -0,0 +1,26 @@\n+##gff-version 3\n+##-----------------------------------------------\n+##PIPELINE VERSION         : iter_search_optional-rv-3168(0b80fa0)\n+##PROTEIN DATABASE VERSION : Viridiplantae_v3.0_pdb\n+##-----------------------------------------------\n+scaffold146.1|size86774\tdante\tprotein_domain\t976\t1289\t293\t+\t.\tName=RH;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=RH|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-RH__REXdb_ID2558|Class_I|LTR|Ty1/copia|Bianca:976-1289[100percent];Best_Hit_DB_Pos=26:134of134;DB_Seq=ISWRSVKQTITATSSNHAELLALHEASRECVWLRSMIQHIQKNCG-LSSGRMDATIIYEDNTACIAQLKEGYIKGDRTKHISPKFF-FTHDLQKDGDISIQQIRSCDNLAD;Region_Seq=ISWRSTKQTIVAISSNHVELLAIHDTSRECVWLRFMIESI\\IMXXXXXXXXXXXXXXXXXXQLKE*YIKCDRTKHISPKFF\\FTQDLQKNGDVIIQQIRSNDNVVD;Query_Seq=ISWRSTKQTIVAISSNHVELLAIHDTSRECVWLRFMIESI-----\\IMXXXXXXXXXXXXXXXXXXQLKE*YIKCDRTKHISPKFF\\FTQDLQKNGDVIIQQIRSNDNVVD;Identity=0.59;Similarity=0.66;Relat_Length=0.813;Relat_Interruptions=1.5;Hit_to_DB_Length=0.83\n+scaffold146.1|size86774\tdante\tprotein_domain\t6810\t7049\t153\t+\t.\tName=PROT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classifications=PROT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Best_Hit=Ty3-PROT__REXdb_ID9702|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:6810-7049[100percent];Best_Hit_DB_Pos=1:80of80;DB_Seq=LVDDGSKVNLLPYRVFQQMGIPEEQLVRDQAPVKGIGGVPVLVEGKVKLALTLGEAPRTRTHYAVFLVVKPPLSYNAILG;Region_Seq=LVDSGASCNLMSKRVMKQMGIPDEKLEFLDATLYAFDRRTIIPAGKIQLPVTLGEEERTRSEMVEFIIVDMDLAYNAILG;Query_Seq=LVDSGASCNLMSKRVMKQMGIPDEKLEFLDATLYAFDRRTIIPAGKIQLPVTLGEEERTRSEMVEFIIVDMDLAYNAILG;Identity=0.44;Similarity=0.62;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0\n+scaffold146.1|size86774\tdante\tprotein_domain\t7656\t8296\t.\t+\t.\tName=RT/INT;Final_Classification=Ambiguous_domain;Region_Hits_Classifications_=RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[246bp],INT|Class_I|LTR|Ty3/gypsy|non
-chromovirus|OTA|Tat|Retand[468bp]\n+scaffold146.1|size86774\tdante\tprotein_domain\t8756\t9241\t538\t+\t.\tName=RT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat;Region_Hits_Classifications=RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[486bp],RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Ogre[441bp];Best_Hit=Ty3-RT__REXdb_ID8210|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:8801-9241[90percent];Best_Hit_DB_Pos=27:173of173;DB_Seq=DFTDLNKACPKDSFPLPHIDRLVDSTAGNELLTFMDAFSGYNQIMMNPEDQEKTSFITDRGIYCYKVMPFGLKNAGATYQRLVNKMFHNHLGKTMEVYIDDMLVKSLKKEDHVKHLEECFDILNKYQMKLNPAKCTFGVPSGEFLGY;Region_Seq=TSIATASGGRTSDGADFKGVNKHCQPDPFPLPHIDRLVDAVAGSSLLSTMDAYSGYHQISLAREDQAKSSFLTEDGVFCYVVMPFGLRNAGATYQRLVNKIFADLLGKEMEIYVDDMIVKSLNDEDHIIYLSHCFEVCRTHRLKLNPAKCCFGVRSGKFLGY;Query_Seq=DFKGVNKHCQPDPFPLPHIDRLVDAVAGSSLLSTMDAYSGYHQISLAREDQAKSSFLTEDGVFCYVVMPFGLRNAGATYQRLVNKIFADLLGKEMEIYVDDMIVKSLNDEDHIIYLSHCFEVCRTHRLKLNPAKCCFGVRSGKFLGY;Identity=0.63;Similarity=0.8;Relat_Length=0.85;Relat_Interruptions=0.0;Hit_to_DB_Length=0.85\n+scaffold146.1|size86774\tdante\tprotein_domain\t9434\t9781\t343\t+\t.\tName=RH;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classifications=RH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Best_Hit=Ty3-RH__REXdb_ID9729|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:9434-9772[97percent];Best_Hit_DB_Pos=1:113of149;DB_Seq=WTEECEEAFQKLKEYLGSPHLLVKPIQGEPLFLYLAVSEHATSSVLVREDDGVQRPIYYTSRALVDAETRYLSLEKIVLALIVSARRLRPYFQAHTIIVLTDQPIRQVLAKPD;Region_Seq=WTDQCDRAFKELKTYLASPPLIVSPTPTETLGLYLAVSEHAVSSVLVAERDGVQHPVYYVSHTLLPAESRYSTVEKFVLALLKSVAKLRHYFESRKVIVYTDQPIKAVLGQSDHTS;Query_Seq=WTDQCDRAFKELKTYLASPPLIVSPTPTETLGLYLAVSEHAVSSVLVAERDGVQHPVYYVSHTLLPAESRYSTVEKFVLALLKSVAKLRHYFESRKVIVYTDQPIKAVLGQSD;Identity=0.58;Similarity=0.73;Relat_Length=0.758;Relat_Interruptions=0.0;Hit_to_DB_Length=0.76\n+scaffold146.1|size86774\tdante\tprotein_domain\t10810\t11667\t747\t+\t.\tName=INT;Final_Classification=Class_I|LT
R|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classific'..b'n-chromovirus|OTA|Athila;Best_Hit=Ty3-INT__REXdb_ID6633|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:16812-17666[98percent];Best_Hit_DB_Pos=1:285of313;DB_Seq=HSHSYGGHFGAKRTAHKVLESGFYWPSIFKDAYHFCKSCEKCQRTGNITHKNQMPLTNILVSEIFDVWGIDFMGPFPSSFGNLYILLVVDYVSKWIEAKATRTNDAKVVLDFVRTHIFNRFGIPKAIISDRGTHFCNRSMEALLRKYHVTHRTSTAYHPQTNGQAEISNREIKSILEKIVQPNRRDWSLRLGDALWAYRTAYKSPIGMSPYRMIYGKACHLPVELEHKAFWAIKQCNMDYDAAGIARKLQLQELEEIRNDAYENARIYKEKTKNLHDRMLTRKEF;Region_Seq=HASDYGGHFGPNRTARRILDVGFYWPSIFRDVYQFCRTCDACQRVGNITNRREMPQNYILANEIFDIWGLDFMGPFPQSQGNNYILVAVDYVSKWVEAIPTRTDDGKTVTEFLRKNIFTRYGVPKAIISDRGTHFCNSTMRAMMKKYNVIHKTTTAYHPQGNGQAEATNREIKSILEKVVNKKRSNWSQKLPDALWAYRTAYKTPIGTTPFRLIYGKHCNLPVGLEHKAYWAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKTYHDKKLLQQNFRERLS;Query_Seq=HASDYGGHFGPNRTARRILDVGFYWPSIFRDVYQFCRTCDACQRVGNITNRREMPQNYILANEIFDIWGLDFMGPFPQSQGNNYILVAVDYVSKWVEAIPTRTDDGKTVTEFLRKNIFTRYGVPKAIISDRGTHFCNSTMRAMMKKYNVIHKTTTAYHPQGNGQAEATNREIKSILEKVVNKKRSNWSQKLPDALWAYRTAYKTPIGTTPFRLIYGKHCNLPVGLEHKAYWAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKTYHDKKLLQQNF;Identity=0.61;Similarity=0.79;Relat_Length=0.911;Relat_Interruptions=0.0;Hit_to_DB_Length=0.91\n+scaffold146.1|size86774\tdante\tprotein_domain\t18554\t18811\t306\t-\t.\tName=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-INT__REXdb_ID6693|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:18554-18802[96percent];Best_Hit_DB_Pos=231:313of313;DB_Seq=WALRLLNFDNNACGEKRKLQLQELEEMRLNAYESSRIYKERTKAYHDKKLQRREFQPGQQVLLFNSRLRLFPGKLKSKWSGPF;Region_Seq=QGNWAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKAYHDKKILQQNFREGQQVLLFNSKLRLFPGKLKSRWMGPF;Query_Seq=WAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKAYHDKKILQQNFREGQQVLLFNSKLRLFPGKLKSRWMGPF;Identity=0.65;Similarity=0.82;Relat_Length=0.265;Relat_Interruptions=0.0;Hit_to_DB_Length=0.27\n+scaffold146.1|size86774\tdante\tprotein_dom
ain\t19158\t19478\t197\t-\t.\tName=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-INT__REXdb_ID6659|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:19182-19448[83percent];Best_Hit_DB_Pos=216:304of314;DB_Seq=YGKPCHLPVELEHKAWWAVKQCNMELDVAGQHRxLQLQELEEIRNDAYESSxIYKEKTKAFHDKQILRKNFEVGQKVLIFHSRLKLFPG;Region_Seq=PRGTISIGLNFGKQCKVLVGMEHENYWEIREMNYEEGADVEQKQMQLQKMDALKLEAYDNSRIDKEKLKAHHAKRILQQNCKKRQQVLIFDSKLKMFPGIPRWMEPF;Query_Seq=FGKQCKVLVGMEHENYWEIREMNYEEGADVEQKQMQLQKMDALKLEAYDNSRIDKEKLKAHHAKRILQQNCKKRQQVLIFDSKLKMFPG;Identity=0.42;Similarity=0.71;Relat_Length=0.283;Relat_Interruptions=0.0;Hit_to_DB_Length=0.28\n+scaffold146.1|size86774\tdante\tprotein_domain\t19976\t20212\t259\t-\t.\tName=PROT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=PROT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-PROT__REXdb_ID6659|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:19976-20212[100percent];Best_Hit_DB_Pos=1:80of80;DB_Seq=MLDLGASINVMPYSIYNSLNLGPMEETCIIIQLADRSNAYPKGVMEDVLVQVNELVFPADFYILKMEDELSPNPTPILLG;Region_Seq=MVDLGASINLMPYYIYSALKLGSLQGTAIIIKLADRSETHPEGVVKDVLAQVNNLVFPADFYVLKMGEAENDDCPLLLG;Query_Seq=MVDLGASINLMPYYIYSALKLGSLQGTAIIIKLADRSETHPEGVVKDVLAQVNNLVFPADFYVLKM-GEAENDDCPLLLG;Identity=0.62;Similarity=0.79;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0\n+scaffold146.1|size86774\tdante\tprotein_domain\t28912\t29124\t216\t-\t.\tName=PROT;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=PROT|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-PROT__REXdb_ID2599|Class_I|LTR|Ty1/copia|Bianca:28912-29124[100percent];Best_Hit_DB_Pos=1:71of71;DB_Seq=CLADCATTHTILRDKRYFLELTLIKANVSTISGTTNLVEGSGRANIMLPNGTRFHINDALYSSKSRRNLLS;Region_Seq=CLVDSATTHTILKNMRYFTSFEKRDVNIATIVCEANIVEGSGRAVIVLPSGTHIRIDDALYANKSRRNLLS;Query_Seq=CLVDSATTHTILKNMRYFTSFEKRDVNIATIVCEANIVEGSGRAVIVLPSGTHIRIDDA
LYANKSRRNLLS;Identity=0.59;Similarity=0.7;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0\n'