comparison nf/subworkflows/ncbi/gnomon/chainer_wnode/main.nf @ 0:d9c5c5b87fec draft

planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author fubar
date Sat, 03 Aug 2024 11:16:53 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d9c5c5b87fec
1 #!/usr/bin/env nextflow
2 nextflow.enable.dsl=2
3
4
5 include { merge_params } from '../../utilities'
6 include { run_align_sort } from '../../default/align_sort_sa/main.nf'
7
8 split_count=16
9
10
// Workflow: chain spliced alignments into gene models ("chains") using the
// NCBI chainer_wnode tool, then partition the results with gpx_make_outputs.
// Pipeline: optional sort/merge of alignments -> job planning (generate_jobs)
// -> parallel chaining (run_chainer, fanned out per job file) -> merge/slice
// of outputs (run_gpx_make_outputs).
11 workflow chainer_wnode {
12 take:
13 alignments
14 hmm_params
15 evidence_denylist
16 gap_fill_allowlist
17 scaffolds
18 trusted_genes
19 genome
20 proteins
21 parameters // Map : extra parameter and parameter update
22 main:
// Sort (or merge-only) the alignments first, unless the caller declared
// them already sorted via the 'input_aligns_sort' parameter containing
// the token "presorted".
23 String input_sorting = parameters.get('input_aligns_sort', '')
24 def sort_aligns = alignments
25 if (!input_sorting.contains("presorted")) {
26 String align_sort_params = ""
// "merge_only" requests a merge pass without a full re-sort.
27 if (input_sorting.contains("merge_only")) {
28 align_sort_params = "-merge"
29 }
// Sort key: subject, ascending subject_start, descending subject_end.
30 align_sort_params += " -ifmt seq-align -compression none -k subject,subject_start,-subject_end "
31 // print(align_sort_params)
32 sort_aligns = run_align_sort([], [], alignments, align_sort_params).collect()
33 //sort_aligns = align_sort(alignments, align_sort_params)
34 }
// Merge caller-supplied parameter overrides onto each tool's defaults.
35 String submit_chainer_params = merge_params("-minimum-abut-margin 20 -separate-within-introns", parameters, 'submit_chainer')
36 String chainer_wnode_params = merge_params("", parameters, 'chainer_wnode')
37 String gpx_make_outputs_params = merge_params("-default-output-name chains -slices-for affinity -sort-by affinity", parameters, 'gpx_make_outputs')
38
// Plan the work into per-node job files, fan out one run_chainer task per
// job file, then gather every task's outputs into a single list.
39 def (jobs, lines_per_file) = generate_jobs(sort_aligns, submit_chainer_params)
40 def collected = run_chainer(jobs.flatten(), sort_aligns, hmm_params, evidence_denylist, gap_fill_allowlist, scaffolds, trusted_genes, genome, proteins, lines_per_file, chainer_wnode_params) | collect
41
42 run_gpx_make_outputs(collected, gpx_make_outputs_params)
43 emit:
44 chains = run_gpx_make_outputs.out.chains
45 chains_slices = run_gpx_make_outputs.out.chains_slices
46 evidence = run_gpx_make_outputs.out.evidence
47 evidence_slices = run_gpx_make_outputs.out.evidence_slices
48 }
49
50
// Process: plan the chaining work. submit_chainer writes a 'jobs' file (one
// job per line); the script splits it round-robin (`split -nr/...`) into at
// most split_count job files named job.000, job.001, ... It also exports
// lines_per_file (ceiling of total_lines / split_count) so run_chainer can
// derive each job file's starting job id from its numeric extension.
51 process generate_jobs {
52 input:
53 path sort_aligns
54 val params
55 output:
56 path "job.*"
57 env lines_per_file
58 script:
// Requested fan-out; the script shrinks it when there are fewer job
// lines than split_count so no empty job files are produced.
59 njobs=split_count
60 """
61 #!/usr/bin/env bash
62 # generate_jobs $sort_aligns $params -output chains -output-slices chains_slices -output-evidence evidence -output-evidence-slices evidence_slices
63 submit_chainer $params -asn $sort_aligns -o jobs
64 total_lines=\$(wc -l <jobs)
65 (( lines_per_file = (total_lines + ${njobs} - 1) / ${njobs} ))
66 echo total_lines=\$total_lines, lines_per_file=\$lines_per_file
67 ####split -l\$lines_per_file jobs job. -da 3
68 # Use round robin to distribute jobs across nodes more evenly
69 if [ \$total_lines -lt $njobs ]; then
70 effective_njobs=\$total_lines
71 else
72 effective_njobs=$njobs
73 fi
74 split -nr/\$effective_njobs jobs job. -da 3
75 """
// Stub: fabricate split_count dummy job lines and split them the same way.
76 stub:
77 """
78 for i in {1..$split_count}; do
79 echo "<job query =\\\"lcl|${sort_aligns}:\${i}-\${i}\\\"></job>" >> jobs
80 done
81 split -nr/$split_count jobs job. -da 3
82 lines_per_file=10
83 """
84 }
85
86
// Process: run chainer_wnode on one job file. Writes the manifest files the
// tool expects, builds a local LDS2 index over the staged genome/protein
// sequences, derives a unique -start-job-id from the job file's numeric
// extension, and prefixes every output filename with that extension so files
// from different tasks cannot collide when staged into gpx_make_outputs.
// NOTE(review): round-robin splitting in generate_jobs means the derived
// start_job_id (extension * lines_per_file + 1) is an approximation of the
// original job numbering, not an exact mapping — appears intentional (see
// the in-script HACK comment).
87 process run_chainer {
88 input:
89 path job
90 path alignments
91 path hmm_params
92 path evidence_denylist
93 path gap_fill_allowlist
94 path scaffolds
95 path trusted_genes
96 path genome, stageAs: 'indexed/*'
97 path proteins_asn, stageAs: 'indexed/*'
98 val lines_per_file
99 val params
100 output:
101 path "output/*"
102 script:
// Numeric suffix of the job file (e.g. 7 from "job.007"); the main script
// re-derives the same value in bash as \$extension, so this Groovy value
// is only used by the stub's output filename.
103 job_num = job.toString().tokenize('.').last().toInteger()
104 """
105 echo "${evidence_denylist.join('\n')}" > evidence_denylist.mft
106 echo "${gap_fill_allowlist.join('\n')}" > gap_fill_allowlist.mft
107 echo "${scaffolds.join('\n')}" > scaffolds.mft
108 echo "${trusted_genes.join('\n')}" > trusted_genes.mft
109 # HACK: derive start_job_id from job file extension
110 filename=\$(basename -- "$job")
111 extension="\${filename##*.}"
112 (( start_job_id = ((10#\$extension) * $lines_per_file) + 1 ))
113
114 # make the local LDS of the genomic and protein (if present) sequences
115 lds2_indexer -source indexed -db LDS2
116
117 # When running multiple jobs on the cluster there is a chance that
118 # several jobs will run on the same node and thus generate files
119 # with the same filename. We need to avoid that to be able to stage
120 # the output files for gpx_make_outputs. We add the job file numeric
121 # extension as a prefix to the filename.
122 mkdir interim
123 chainer_wnode $params -start-job-id \$start_job_id -workers 32 -input-jobs ${job} -O interim -nogenbank -lds2 LDS2 -evidence-denylist-manifest evidence_denylist.mft -gap-fill-allowlist-manifest gap_fill_allowlist.mft -param ${hmm_params} -scaffolds-manifest scaffolds.mft -trusted-genes-manifest trusted_genes.mft
124 mkdir output
125 for f in interim/*; do
126 if [ -f \$f ]; then
127 mv \$f output/\${extension}_\$(basename \$f)
128 fi
129 done
130 """
131
// Stub: emit a single placeholder output named after the job number.
132 stub:
133 job_num = job.toString().tokenize('.').last().toInteger()
134 """
135 mkdir -p output
136 touch output/sample_chainer_wnode.${job_num}.out
137 """
138 }
139
140
// Process: merge all per-task chainer outputs with gpx_make_outputs, writing
// partitioned chains/evidence files plus their .slices companions into
// output/. Evidence outputs are optional: depending on the parameters the
// tool may emit chains only (hence `optional: true` on those channels).
141 process run_gpx_make_outputs {
142 input:
143 path files, stageAs: "gpx_inputs/*"
144 val params
145 output:
146 path "output/chains.*.out.gz", emit: 'chains'
147 path "output/chains.*.out.gz.slices", emit: 'chains_slices'
148 path "output/evidence.*.out.gz", emit: 'evidence', optional: true
149 path "output/evidence.*.out.gz.slices", emit: 'evidence_slices', optional: true
150 script:
// The tool consumes a manifest listing all staged inputs; '@' and '#' in
// the -output template are placeholders expanded by gpx_make_outputs.
151 """
152 ls -1 gpx_inputs/* > gpx_inputs.mft
153 mkdir -p output
154 gpx_make_outputs $params -input-manifest gpx_inputs.mft -output output/@.#.out.gz -output-manifest output/@.mft -slices-manifest output/@_slices.mft -num-partitions $split_count
155 """
// Stub: create empty files matching every output pattern.
156 stub:
157 """
158 mkdir -p output
159 echo ${files}
160 for i in {1..$split_count}; do
161 touch output/chains.\$i.out.gz
162 touch output/chains.\$i.out.gz.slices
163 touch output/evidence.\$i.out.gz
164 touch output/evidence.\$i.out.gz.slices
165 done
166 """
167 }