Mercurial > repos > davidvanzessen > shm_csr
annotate split_imgt_file.py @ 94:84e9e5c8c101 draft
"planemo upload commit d4be85014b638f1d50b318d4b735be7f6e973140"
author | rhpvorderman |
---|---|
date | Fri, 24 Mar 2023 16:58:28 +0000 |
parents | cf8ad181628f |
children |
rev | line source |
---|---|
92
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
2 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
3 """ |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
4 Script to split IMGT file into several archives for each of the genes |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
5 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
6 Rather than creating each new archive individually this script will read |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
7 the input files only once and as such enormously shorten processing time. |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
8 """ |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
9 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
10 import argparse |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
11 import io |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
12 import os |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
13 import tarfile |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
14 import tempfile |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
15 from typing import Iterator, List, Tuple |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
16 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
17 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
def merged_txt_to_match_dict(merged: str):
    """
    Read the merged table and map every sequence ID to its best match.

    :param merged: Path to a tab-separated text file. Its first line is a
                   header that must contain a "Sequence.ID" column and may
                   contain a "best_match" column.
    :return: A dict of sequence ID -> best match. Rows whose best match
             contains "unmatched" are left out. When the file has no
             "best_match" column every sequence ID maps to "".
    :raises StopIteration: when the file is completely empty.
    :raises ValueError: when the header has no "Sequence.ID" column.
    """
    with open(merged, "rt") as f:
        header = next(f).strip("\n")
        column_names = header.split("\t")
        # For the baseline result there is no best_match column
        if "best_match" in column_names:
            best_match_index = column_names.index("best_match")
        else:
            best_match_index = None
        sequence_id_index = column_names.index("Sequence.ID")
        match_dict = {}
        for line in f:
            # Only remove the line terminator. Stripping all whitespace
            # would also remove leading and trailing tabs, which shifts the
            # columns of rows whose first or last cells are empty.
            line = line.strip("\n")
            if not line:
                # Completely empty lines carry no record.
                continue
            values = line.split("\t")
            sequence_id = values[sequence_id_index]
            if best_match_index is None:
                best_match = ""
            else:
                best_match = values[best_match_index]
                if "unmatched" in best_match:
                    # For some reason the table has values such as:
                    # unmatched, IGA2
                    continue
            match_dict[sequence_id] = best_match
    return match_dict
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
41 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
42 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
def imgt_to_tables(imgt_file: str) -> Iterator[Tuple[str, io.TextIOWrapper]]:
    """
    Yield the tables stored in an IMGT archive one by one.

    Members named "README.txt", members whose name starts with "11_" and
    members that are not extractable as a file (for example directories)
    are skipped.

    :param imgt_file: Path to the IMGT tar archive. The compression is
                      detected by tarfile.
    :return: An iterator of (member name, text file object) tuples. Each
             file object is closed as soon as the next item is requested
             or the generator is closed, so it must be consumed before
             advancing.
    """
    print(f"opening IMGT file: {imgt_file}")
    with tarfile.open(imgt_file, "r") as archive:
        for member in archive:
            if member.name == "README.txt":
                continue
            if member.name.startswith("11_"):
                continue
            f = archive.extractfile(member)
            if f is None:
                # extractfile returns None for members that are neither a
                # regular file nor a link; there is no table to read then.
                continue
            # The with statement also closes the wrapper when the consumer
            # stops iterating early or raises.
            with io.TextIOWrapper(f) as f_text:
                yield member.name, f_text
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
58 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
59 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
def _split_table(name: str, table: io.TextIOWrapper, match_dict,
                 genes: List[str], gene_tarfiles: List[tarfile.TarFile]):
    """
    Split one IMGT table over the gene archives.

    For every gene a temporary file is filled with the header and the rows
    that belong to that gene. Each temporary file is then added to the
    corresponding archive under the original member name. The temporary
    files are always closed and removed, also when an error occurs.

    :param name: Name of the table inside the IMGT archive.
    :param table: Text file object of the table, positioned at the header.
    :param match_dict: Mapping of sequence ID -> best match.
    :param genes: The genes to split out, in the same order as
                  gene_tarfiles.
    :param gene_tarfiles: One writable tar archive per gene.
    """
    gene_files = []
    try:
        for gene in genes:
            fp, fname = tempfile.mkstemp()
            # The file pointer fp will be wrapped in a python file object
            # so we can ensure there remain no open files.
            gene_files.append((gene, open(fp, mode="wt"), fname))
        header = next(table)
        header_number_of_tabs = header.count('\t')
        column_names = header.strip("\n").split("\t")
        fr1_columns = [index for index, column in enumerate(column_names)
                       if column.startswith("FR1")]
        sequence_id_index = column_names.index("Sequence ID")
        for _, gene_file, _ in gene_files:
            gene_file.write(header)
        for line in table:
            # IMGT sometimes delivers half-empty rows. Pad them with tabs
            # so every row has as many columns as the header.
            missing_tabs = header_number_of_tabs - line.count("\t")
            if missing_tabs:
                line = line.strip("\n") + missing_tabs * "\t" + "\n"
            values = line.strip("\n").split("\t")
            match = match_dict.get(values[sequence_id_index])
            if match is None:
                # Sequence is not in the merged file (or was unmatched).
                continue
            if name.startswith("8_"):
                # change the FR1 columns to 0 in the "8_..." file
                for index in fr1_columns:
                    values[index] = "0"
                line = "\t".join(values) + "\n"
            for gene, gene_file, _ in gene_files:
                # Substring test: the empty gene "" matches every row.
                if gene in match:
                    gene_file.write(line)
        for gene_tarfile, (_, gene_file, fname) in zip(gene_tarfiles,
                                                       gene_files):
            # Make sure everything is on disk before tarfile reads it.
            gene_file.flush()
            gene_tarfile.add(fname, name)
    finally:
        for _, gene_file, fname in gene_files:
            gene_file.close()
            os.remove(fname)


def split_imgt(imgt_file: str, merged_file: str, outdir: str, genes: List[str],
               prefix: str):
    """
    This function creates a separate tar file for each of the gene matches
    based on the merged file. Unmatched genes are left out.

    The archives are written to "<outdir>/<prefix>_<gene>.txz", or to
    "<outdir>/<prefix>.txz" for the empty gene. All output archives are
    closed and all temporary files are removed, also when an error occurs.

    :param imgt_file: The original IMGT file
    :param merged_file: The merged data file generated by SHM&CSR pipeline
    :param outdir: The output directory. It is created when missing.
    :param genes: The genes to split out. A gene is selected for a row when
                  it is a substring of the row's best match, so the empty
                  string '' selects all identified genes.
    :param prefix: Prefix for the names of the created archives.
    :return: None
    """
    match_dict = merged_txt_to_match_dict(merged_file)
    os.makedirs(outdir, exist_ok=True)
    gene_tarfiles = []
    try:
        for gene in genes:
            new_filename = f"{prefix}_{gene}.txz" if gene else f"{prefix}.txz"
            gene_tarfiles.append(
                tarfile.open(os.path.join(outdir, new_filename), mode="w:xz")
            )
        # Read each table one by one and per line select in which output
        # files it should go.
        for name, table in imgt_to_tables(imgt_file):
            _split_table(name, table, match_dict, genes, gene_tarfiles)
    finally:
        for gene_tarfile in gene_tarfiles:
            gene_tarfile.close()
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
123 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
124 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
def argument_parser() -> argparse.ArgumentParser:
    """
    Build the command line parser of this script.

    Positional arguments are the IMGT file, the merged file and one or more
    genes. The --outdir and --prefix options are required: the output
    location and the archive names are built from them, so leaving them out
    can never give a useful result.

    :return: The configured argument parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("imgt_file", help="The original IMGT FILE")
    parser.add_argument("merged", help="merged.txt file")
    parser.add_argument("--outdir", required=True, help="output directory")
    parser.add_argument(
        "genes",
        nargs="+",
        help="The genes to split out. Use '-' for all identified genes.")
    parser.add_argument("--prefix", required=True,
                        help="Prefix for the archives and "
                             "directories")
    return parser
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
137 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
138 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
def main():
    """
    Command line entry point.

    Parses the arguments and splits the IMGT file. A gene given as '-' on
    the command line is passed on as the empty string.
    """
    cli = argument_parser().parse_args()
    selected_genes = []
    for requested in cli.genes:
        if requested == "-":
            selected_genes.append("")
        else:
            selected_genes.append(requested)
    split_imgt(cli.imgt_file, cli.merged, cli.outdir, selected_genes,
               cli.prefix)
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
144 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
145 |
cf8ad181628f
planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff
changeset
|
# Only run the command line interface when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()