nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf @ 0:d9c5c5b87fec (draft)
commit: planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author: fubar
date: Sat, 03 Aug 2024 11:16:53 +0000
#!/usr/bin/env nextflow
nextflow.enable.dsl=2


include { merge_params } from '../../utilities'
include { run_align_sort } from '../../default/align_sort_sa/main.nf'

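// Fan-out width shared by the processes below: generate_jobs splits the work
// into at most this many job files, and run_gpx_make_outputs partitions its
// outputs the same number of ways.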
split_count=16


workflow chainer_wnode {
    take:
        alignments
        hmm_params
        evidence_denylist
        gap_fill_allowlist
        scaffolds
        trusted_genes
        genome
        proteins
        parameters      // Map: extra parameters and parameter overrides
    main:
        String input_sorting = parameters.get('input_aligns_sort', '')
        def sort_aligns = alignments
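        // Unless the input is declared presorted, sort the alignments by
        // subject, ascending subject_start, and descending subject_end;
        // 'merge_only' adds -merge, which presumably just merges chunks that
        // are already sorted.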
        if (!input_sorting.contains("presorted")) {
            String align_sort_params = ""
            if (input_sorting.contains("merge_only")) {
                align_sort_params = "-merge"
            }
            align_sort_params += " -ifmt seq-align -compression none -k subject,subject_start,-subject_end "
            // print(align_sort_params)
            sort_aligns = run_align_sort([], [], alignments, align_sort_params).collect()
            //sort_aligns = align_sort(alignments, align_sort_params)
        }
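        // merge_params (from ../../utilities) overlays any user overrides from
        // the `parameters` map onto the defaults given here, one section per
        // tool name.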
        String submit_chainer_params = merge_params("-minimum-abut-margin 20 -separate-within-introns", parameters, 'submit_chainer')
        String chainer_wnode_params = merge_params("", parameters, 'chainer_wnode')
        String gpx_make_outputs_params = merge_params("-default-output-name chains -slices-for affinity -sort-by affinity", parameters, 'gpx_make_outputs')

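        // Fan out one run_chainer task per job file (jobs.flatten()), then
        // gather every task's output files into a single list for the final
        // merge step.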
        def (jobs, lines_per_file) = generate_jobs(sort_aligns, submit_chainer_params)
        def collected = run_chainer(jobs.flatten(), sort_aligns, hmm_params, evidence_denylist, gap_fill_allowlist, scaffolds, trusted_genes, genome, proteins, lines_per_file, chainer_wnode_params) | collect

        run_gpx_make_outputs(collected, gpx_make_outputs_params)
    emit:
        chains = run_gpx_make_outputs.out.chains
        chains_slices = run_gpx_make_outputs.out.chains_slices
        evidence = run_gpx_make_outputs.out.evidence
        evidence_slices = run_gpx_make_outputs.out.evidence_slices
}

process generate_jobs {
    input:
        path sort_aligns
        val params
    output:
        path "job.*"
        env lines_per_file
    script:
        njobs=split_count
        """
        #!/usr/bin/env bash
        # generate_jobs $sort_aligns $params -output chains -output-slices chains_slices -output-evidence evidence -output-evidence-slices evidence_slices
        submit_chainer $params -asn $sort_aligns -o jobs
        total_lines=\$(wc -l <jobs)
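        # integer ceiling division: e.g. total_lines=100 and njobs=16 give lines_per_file=7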
        (( lines_per_file = (total_lines + ${njobs} - 1) / ${njobs} ))
        echo total_lines=\$total_lines, lines_per_file=\$lines_per_file
        ####split -l\$lines_per_file jobs job. -da 3
        # Use round robin to distribute jobs across nodes more evenly
        if [ \$total_lines -lt $njobs ]; then
            effective_njobs=\$total_lines
        else
            effective_njobs=$njobs
        fi
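        # GNU split: -n r/N deals lines out round robin into N chunks, and
        # -d -a 3 produces three-digit numeric suffixes (job.000, job.001, ...)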
        split -nr/\$effective_njobs jobs job. -da 3
        """
    stub:
        """
        for i in {1..$split_count}; do
            echo "<job query =\\\"lcl|${sort_aligns}:\${i}-\${i}\\\"></job>" >> jobs
        done
        split -nr/$split_count jobs job. -da 3
        lines_per_file=10
        """
}


process run_chainer {
    input:
        path job
        path alignments
        path hmm_params
        path evidence_denylist
        path gap_fill_allowlist
        path scaffolds
        path trusted_genes
        path genome, stageAs: 'indexed/*'
        path proteins_asn, stageAs: 'indexed/*'
        val lines_per_file
        val params
    output:
        path "output/*"
    script:
        job_num = job.toString().tokenize('.').last().toInteger()  // note: unused in this script; the shell below re-derives the suffix itself
104 """ | |
105 echo "${evidence_denylist.join('\n')}" > evidence_denylist.mft | |
106 echo "${gap_fill_allowlist.join('\n')}" > gap_fill_allowlist.mft | |
107 echo "${scaffolds.join('\n')}" > scaffolds.mft | |
108 echo "${trusted_genes.join('\n')}" > trusted_genes.mft | |
109 # HACK: derive start_job_id from job file extension | |
110 filename=\$(basename -- "$job") | |
111 extension="\${filename##*.}" | |
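        # 10# forces base 10 so zero-padded suffixes such as 008 are not read
        # as invalid octal; spacing the start ids lines_per_file apart keeps
        # each job file's id range disjoint from the others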
        (( start_job_id = ((10#\$extension) * $lines_per_file) + 1 ))

        # make the local LDS of the genomic and protein (if present) sequences
        lds2_indexer -source indexed -db LDS2

        # When running multiple jobs on the cluster there is a chance that
        # several jobs will run on the same node and thus generate files
        # with the same filename. We need to avoid that to be able to stage
        # the output files for gpx_make_outputs. We add the job file numeric
        # extension as a prefix to the filename.
        mkdir interim
        chainer_wnode $params -start-job-id \$start_job_id -workers 32 \\
            -input-jobs ${job} -O interim -nogenbank -lds2 LDS2 \\
            -evidence-denylist-manifest evidence_denylist.mft \\
            -gap-fill-allowlist-manifest gap_fill_allowlist.mft \\
            -param ${hmm_params} -scaffolds-manifest scaffolds.mft \\
            -trusted-genes-manifest trusted_genes.mft
        mkdir output
        for f in interim/*; do
            if [ -f \$f ]; then
                mv \$f output/\${extension}_\$(basename \$f)
            fi
        done
        """

    stub:
        job_num = job.toString().tokenize('.').last().toInteger()
        """
        mkdir -p output
        touch output/sample_chainer_wnode.${job_num}.out
        """
}

process run_gpx_make_outputs {
    input:
        path files, stageAs: "gpx_inputs/*"
        val params
    output:
        path "output/chains.*.out.gz", emit: 'chains'
        path "output/chains.*.out.gz.slices", emit: 'chains_slices'
        path "output/evidence.*.out.gz", emit: 'evidence', optional: true
        path "output/evidence.*.out.gz.slices", emit: 'evidence_slices', optional: true
    script:
        """
        ls -1 gpx_inputs/* > gpx_inputs.mft
        mkdir -p output
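        # judging by the output patterns declared above, @ expands to the
        # stream name (chains or evidence) and # to the partition index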
        gpx_make_outputs $params -input-manifest gpx_inputs.mft \\
            -output output/@.#.out.gz -output-manifest output/@.mft \\
            -slices-manifest output/@_slices.mft -num-partitions $split_count
        """
    stub:
        """
        mkdir -p output
        echo ${files}
        for i in {1..$split_count}; do
            touch output/chains.\$i.out.gz
            touch output/chains.\$i.out.gz.slices
            touch output/evidence.\$i.out.gz
            touch output/evidence.\$i.out.gz.slices
        done
        """
}