comparison mtbls-dwnld @ 0:8dab200e02cb draft

"planemo upload commit 239561a6401593c5f87df40ac971a9aa393c4663-dirty"
author prog
date Tue, 07 Jan 2020 09:05:21 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8dab200e02cb
1 #!/bin/bash
2 # vi: fdm=marker
3
4 # Constants {{{1
5 ################################################################
6
# Script identity. PROG_NAME is used in the usage text and temp-file names,
# PROG_PATH locates the isaslicer.py helper shipped next to this script.
# "$0" is quoted so an install path containing spaces is handled correctly.
PROG_NAME=$(basename "$0")
PROG_PATH=$(dirname "$0")
ISASLICER="$PROG_PATH/isaslicer.py"

# Boolean-like flag values used throughout the script.
YES=yes
NO=no

# Default token used for public aspera downloads when -t is not given.
ASPERA_PUBLIC_TOKEN=Xz68YfDe

# Names of the two supported download programs.
ASCP=ascp
WGET=wget

# Platform detection: only used to decide which wget flags are passed.
PLATFORM=
DISTRIBUTION=
WGET_FLAGS=${WGET_FLAGS:-}
if command -v uname >/dev/null 2>&1 ; then
  PLATFORM=$(uname)
fi
if [[ $PLATFORM == Linux && -e /proc/version ]] ; then
  # Extract the first word following the first "(" of /proc/version.
  DISTRIBUTION=$(sed 's/^.*(\([^ ]*\).*$/\1/' /proc/version)
fi
# The --progress=dot flag is skipped when the distribution is detected as Alpine.
[[ $DISTRIBUTION == Alpine ]] || WGET_FLAGS="--progress=dot"
20
21 # Global variables {{{1
22 ################################################################
23
# Option state, filled in by read_args. An empty string means "not set".
# Counters (how many times -g / -q were given).
DEBUG=0
QUIET=0
# Switches (set to $YES when the matching option is given).
ASPERA=''
COMPRESSED=''
METADATA_ONLY=''
PRIVATE=''
TMP_IN_OUTPUT=''
# Options carrying a value.
FACTOR_VALUE=''
HTML=''
OUTPUT=''
TOKEN=''
35
36 # Print help {{{1
37 ################################################################
38
# Print the usage message and the option list on stdout.
# Globals read: PROG_NAME, ASPERA_PUBLIC_TOKEN.
print_help() {
  local -a help_lines=(
    "Usage: $PROG_NAME [options] study"
    ""
    "Retrieves a study from Metabolights database."
    "By default it uses the ftp server, but with -a option you can ask for using aspera server (you will need the aspera client ascp to be installed)."
    ""
    "Options:"
    " -a, --aspera Use aspera server for downloading. You need the ascp client to be installed, it is freely available for linux 64. See http://downloads.asperasoft.com."
    " -c, --compressed Output in compressed format. If unset, the output will be a directory."
    " -f, --factor-value Filter study on a factor value. Example: \"-f myfactor=myvalue\". Only available with wget downloader."
    " -g, --debug Debug mode."
    " -h, --help Print this help message."
    " -H, --html FILE Write HTML file that list files contained in output directory. Not compatible with compressed output."
    " -M, --metadata-only Download only the metadata (ISA-Tab files i_*, m_*, s_* and a_*) files. This option has no effet if aspera is selected (option -a)."
    " -o, --output NAME Set the name of the output (both for directory output or compressed file output). By default the name of the output is determined by the download tool."
    " -p, --private Indicate the study to download is not public. This is meant for aspera download, since the URL will be different for a public or a private study."
    " -q, --quiet Does not print any output. Can be specified twice in order to be real quiet."
    " -t, --token TOKEN Set the token or password to use. For aspera public download, if you don't specify a token, the default token '$ASPERA_PUBLIC_TOKEN' will be used."
    " -T, --tmp-in-output If an output is specified and it is a directory (-c option must not be set), then use it for writing intermediate files."
  )

  # One array element per output line.
  printf '%s\n' "${help_lines[@]}"
}
59
60 # Error {{{1
61 ################################################################
62
# Report a fatal error and stop the script.
# Arguments: $1 - message, printed on stderr prefixed with "ERROR: ".
# Never returns: exits the current (sub)shell with status 1.
error() {
  printf '%s\n' "ERROR: $1" >&2
  exit 1
}
71
72 # Debug {{{1
73 ################################################################
74
# Print a debug message on stderr when debug mode is enabled.
# Globals read: DEBUG (message is printed when DEBUG >= 1).
# Arguments:    $1 - message.
# Returns:      always 0. The previous "[[ ... ]] && echo" form returned 1
#               whenever debugging was off, which made every function ending
#               with a debug call report a failure status.
debug() {

  local dbgmsg="$1"

  if [[ $DEBUG -ge 1 ]] ; then
    echo "[DEBUG] $dbgmsg" >&2
  fi

  return 0
}
81
82
83 # Read args {{{1
84 ################################################################
85
# Parse the command line and fill in the global option variables.
# Globals written: ASPERA, COMPRESSED, DEBUG, FACTOR_VALUE, HTML,
#                  METADATA_ONLY, OUTPUT, PRIVATE, QUIET, TMP_IN_OUTPUT,
#                  TOKEN, STUDY.
# Globals read:    YES, NO, ASPERA_PUBLIC_TOKEN.
# Arguments:       the script arguments ("$@").
# Exits (through error) on unknown options, on a missing option value, or
# when the number of remaining arguments is not exactly one.
read_args() {

  local args="$*" # save arguments for debugging purpose
  local shift_count
  local combined
  local i
  local -a expanded

  # Read options
  while true ; do
    shift_count=1

    # Options taking a value need a second word. Without this check
    # "shift 2" fails, shifts nothing, and the loop never terminates.
    case "$1" in
      -f|--factor-value|-H|--html|-o|--output|-t|--token)
        [[ $# -ge 2 ]] || error "Option $1 requires an argument." ;;
    esac

    case "$1" in
      -a|--aspera) ASPERA=$YES ;;
      -c|--compressed) COMPRESSED=$YES ;;
      -f|--factor-value) FACTOR_VALUE="$2" ; shift_count=2 ;;
      -g|--debug) DEBUG=$((DEBUG + 1)) ;;
      -h|--help) print_help ; exit 0 ;;
      -M|--metadata-only) METADATA_ONLY=$YES ;;
      -H|--html) HTML="$2" ; shift_count=2 ;;
      -o|--output) OUTPUT="$2" ; shift_count=2 ;;
      -p|--private) PRIVATE=$YES ;;
      -q|--quiet) QUIET=$((QUIET + 1)) ;;
      -t|--token) TOKEN="$2" ; shift_count=2 ;;
      -T|--tmp-in-output) TMP_IN_OUTPUT=$YES ;;
      -) error "Illegal option $1." ;;
      --) error "Illegal option $1." ;;
      --*) error "Illegal option $1." ;;
      -?) error "Unknown option $1." ;;
      -[^-]*)
        # Clustered short options ("-qa"): expand into one option per
        # character ("-q" "-a"). The cluster itself stays in $1 and is
        # dropped by the shift below. Every character becomes its own
        # option, so a cluster like "-q1" ends with "Unknown option -1"
        # instead of being re-queued unchanged forever.
        combined=${1#-}
        expanded=()
        for (( i = 0; i < ${#combined}; i++ )) ; do
          expanded+=("-${combined:i:1}")
        done
        set -- "$1" "${expanded[@]}" "${@:2}"
        ;;
      *) break ;;
    esac
    shift "$shift_count"
  done

  # Read remaining arguments
  [[ $# -eq 1 ]] || error "You must specify one, and only one, study to retrieve."
  STUDY="$1"

  # Check token
  if [[ -n $ASPERA && -z $TOKEN ]] ; then
    if [[ -z $PRIVATE ]] ; then
      TOKEN=$ASPERA_PUBLIC_TOKEN
    else
      error "You need to specify a token for retrieving private studies with aspera."
    fi
  fi
  [[ -z $PRIVATE || -n $TOKEN ]] || error "You need to set a token for retrieving private studies."

  # Turn off --tmp-in-output if --compressed is set or no output is given
  if [[ $TMP_IN_OUTPUT == "$YES" && ( $COMPRESSED == "$YES" || -z $OUTPUT ) ]] ; then
    TMP_IN_OUTPUT=$NO
  fi

  # Debug
  debug "Arguments are : $args"
  debug "Study to retrieve is : $STUDY"
  debug "ASPERA=$ASPERA"
  debug "COMPRESSED=$COMPRESSED"
  debug "DEBUG=$DEBUG"
  debug "FACTOR_VALUE=$FACTOR_VALUE"
  debug "HTML=$HTML"
  debug "METADATA_ONLY=$METADATA_ONLY"
  debug "OUTPUT=$OUTPUT"
  debug "PRIVATE=$PRIVATE"
  debug "QUIET=$QUIET"
  debug "TMP_IN_OUTPUT=$TMP_IN_OUTPUT"
  debug "TOKEN=$TOKEN"
  if [[ -n $ASPERA ]] ; then
    debug "Aspera will be used."
  fi
  if [[ -n $TOKEN ]] ; then
    debug "Token/Password is \"$TOKEN\"."
  fi
}
151
152 # Get download output path {{{1
153 ################################################################
154
# Compute the path at which a downloader leaves its result.
# Globals read: WGET.
# Arguments:    $1 - downloader name
#               $2 - study name
#               $3 - output directory (may be empty)
# Outputs:      "[<dir>/]<study>" with ".zip" appended for the wget downloader.
get_download_output_path() {

  local tool="$1"
  local study="$2"
  local dir="$3"
  local prefix=""
  local suffix=""

  if [[ -n $dir ]] ; then
    prefix="$dir/"
  fi
  if [[ $tool == $WGET ]] ; then
    suffix=".zip"
  fi

  echo "${prefix}${study}${suffix}"
}
167
168 # Download with ascp {{{1
169 ################################################################
170
# Download a study with the aspera client.
# Globals read: ASCP, QUIET, PRIVATE, STUDY, TOKEN, YES.
# Side effects: exports ASPERA_SCP_PASS; creates $3 when given.
# Arguments:    $1 - study name (kept for symmetry with download_with_wget;
#                    the link is built from the global STUDY)
#               $2 - $YES to restrict the transfer to metadata files
#               $3 - directory in which to run the download (may be empty)
# Exits (through error) when ascp is missing or the transfer fails.
download_with_ascp() {

  local study_name="$1"
  local metadata_only="$2"
  local output_dir="$3"
  local -a dwnld_flags=()
  local dwnld_link=
  local study_path=
  local curdir=
  local status=0

  # Check ascp
  command -v "$ASCP" >/dev/null 2>&1 || error "ascp command not found. Please install Aspera client, version 3.7.4 or greater. See http://downloads.asperasoft.com/en/downloads/62."
  debug "$ASCP: $(command -v "$ASCP")"
  debug "QUIET: $QUIET"

  # Silence downloader output
  [[ $QUIET -eq 0 ]] || dwnld_flags+=(-q)

  # Set download flags
  dwnld_flags+=(--policy=fair -T -l 1g -P33001)

  # Restrict the transfer to metadata files
  if [[ $metadata_only == "$YES" ]] ; then
    dwnld_flags+=(-N '?_*.t*' -E '*.*' -E 'p*')
  fi

  # Set download link
  if [[ -z $PRIVATE ]] ; then

    # Make full path for public study
    study_path=$STUDY
    if [[ $study_path == MTBLS* ]] ; then
      study_path="/studies/public/$study_path"
    fi
    dwnld_link="fasp-ml@fasp.ebi.ac.uk:$study_path"

  # Private study
  else
    dwnld_link="mtblight@hx-fasp-1.ebi.ac.uk:$STUDY"
  fi

  # Export token
  debug "export ASPERA_SCP_PASS=\"$TOKEN\""
  export ASPERA_SCP_PASS="$TOKEN"

  # Run download command from inside the output directory
  if [[ -n $output_dir ]] ; then
    mkdir -p "$output_dir" || error "Unable to create directory \"$output_dir\"."
    curdir=$(pwd)
    cd "$output_dir" || error "Unable to enter directory \"$output_dir\"."
  fi
  debug "Download command: $ASCP ${dwnld_flags[*]} $dwnld_link ."
  # Capture the status immediately: once another command has run, $? no
  # longer holds the exit code of ascp.
  "$ASCP" "${dwnld_flags[@]}" "$dwnld_link" . || status=$?
  if [[ -n $output_dir ]] ; then
    cd "$curdir" || error "Unable to go back to directory \"$curdir\"."
  fi
  [[ $status -eq 0 ]] || error "Downloading of study $STUDY has failed. Error code returned is $status."
}
229
230 # Download with wget {{{1
231 ################################################################
232
# Download a study (or only its metadata) as a zip file with wget.
# Globals read: WGET, WGET_FLAGS, QUIET, TOKEN, STUDY, YES, PROG_NAME.
# Arguments:    $1 - study name
#               $2 - $YES to download only the metadata archive
#               $3 - directory receiving "<study>.zip" (may be empty)
# Exits (through error) when wget is missing or the download fails.
download_with_wget() {

  local study_name="$1"
  local metadata_only="$2"
  local output_dir="$3"
  local -a dwnld_flags=()
  local file="$study_name"
  local dwnld_link=
  local output_path="$study_name.zip"
  local wget_log_file=
  local wget_log=

  # Check wget
  command -v "$WGET" >/dev/null 2>&1 || error "wget command not found."
  debug "$WGET: $(command -v "$WGET")"

  # Set download link
  if [[ $metadata_only == "$YES" ]] ; then
    file=metadata
  fi
  dwnld_link="https://www.ebi.ac.uk/metabolights/$study_name/files/$file"
  if [[ -n $TOKEN ]] ; then
    dwnld_link+="?token=$TOKEN"
  fi

  # Set download output
  if [[ -n $output_dir ]] ; then
    mkdir -p "$output_dir" || error "Unable to create directory \"$output_dir\"."
    output_path="$output_dir/$output_path"
  fi
  # Flags are kept in an array so a path containing spaces stays one word.
  dwnld_flags=(-O "$output_path")

  # Silence downloader output
  wget_log_file=$(mktemp -t "$PROG_NAME.XXXXXX") || error "Unable to create temporary log file."
  [[ $QUIET -eq 0 ]] || dwnld_flags+=(-q -o "$wget_log_file")

  # Run download command
  debug "Download command: $WGET $WGET_FLAGS ${dwnld_flags[*]} $dwnld_link"
  # WGET_FLAGS is deliberately unquoted: it may be empty or hold several flags.
  # shellcheck disable=SC2086
  if ! "$WGET" $WGET_FLAGS "${dwnld_flags[@]}" "$dwnld_link" ; then
    # Read and remove the log before error exits, so no temp file is left.
    wget_log=$(cat "$wget_log_file")
    rm -f "$wget_log_file"
    error "Downloading of study $STUDY has failed. wget log file: $wget_log"
  fi
  rm -f "$wget_log_file"
}
268
269 # Download {{{1
270 ################################################################
271
# Dispatch a download to the implementation matching the chosen downloader.
# Globals read: WGET.
# Arguments:    $1 - downloader name ($WGET selects wget, anything else ascp)
#               $2 - study name
#               $3 - metadata-only flag
#               $4 - output directory (may be empty)
download() {

  local tool="$1"
  local study="$2"
  local only_metadata="$3"
  local target_dir="$4"

  case $tool in
    $WGET) download_with_wget "$study" "$only_metadata" "$target_dir" ;;
    *) download_with_ascp "$study" "$only_metadata" "$target_dir" ;;
  esac
}
285
286 # Check download {{{1
287 ################################################################
288
# Verify that a download produced something usable.
# Arguments: $1 - path of the downloaded file or directory.
# Exits (through error) when the path is neither a file nor a directory,
# or when it is an empty regular file.
check_download() {

  local download_path="$1"

  # The messages name the checked path; they used to interpolate
  # $tmp_output, which is not defined in this function, so the path was
  # always missing from the message.
  if [[ ! -d $download_path && ! -f $download_path ]] ; then
    error "The downloading of $download_path failed. The output file doesn't exist."
  fi
  if [[ -f $download_path && ! -s $download_path ]] ; then
    error "The downloading of $download_path failed. The output file is empty."
  fi
}
296
297 # Unzip study {{{1
298 ################################################################
299
# Extract a downloaded zip archive into a directory.
# Globals read: QUIET.
# Arguments:    $1 - path of the zip archive
#               $2 - destination directory; when empty, the archive path
#                    without its extension is used (an existing directory
#                    at that path is removed first)
# Outputs:      the destination directory on stdout; unzip output goes to
#               stderr so it does not mix with that result.
# Exits (through error) when unzip is missing or the extraction fails.
unzip_study() {

  local tmp_output="$1"
  local output="$2"
  local -a unzip_flags=()
  local zip_abs_path=

  # Silence unzip program
  [[ $QUIET -eq 0 ]] || unzip_flags=(-qq)

  debug "Unzipping file \"$tmp_output\"."
  [[ -z $output ]] || debug "Unzipping into \"$output\"."

  command -v unzip >/dev/null 2>&1 || error "unzip command not found. Please install zip package."

  debug "Zipped file is \"$tmp_output\"."
  if [[ -z $output ]] ; then
    output=${tmp_output%.*}
    if [[ -d $output ]] ; then
      rm -r "$output"
    fi
  fi
  debug "Output directory will be \"$output\"."

  # The archive path must be absolute because unzip runs from inside $output.
  zip_abs_path=$(realpath "$tmp_output") || error "Unable to resolve path of archive $tmp_output."
  debug "Current directory is \"$(pwd)\"."
  mkdir -p "$output" || error "Unable to create directory $output."

  # Run in a subshell: the caller's working directory is never changed, and
  # a failing cd can no longer make unzip extract into the wrong directory.
  ( cd "$output" && unzip "${unzip_flags[@]}" "$zip_abs_path" >&2 ) || error "Unable to unzip archive $zip_abs_path."

  echo "$output"
}
331
332 # As zip {{{1
333 ################################################################
334
# Make sure a download is available as a zip file.
# Globals read: QUIET.
# Arguments:    $1 - path of a directory or of a file.
# Outputs:      on stdout, "<dir>.zip" when $1 was a directory (the
#               directory is compressed then removed), else $1 unchanged.
# Exits (through error) when zip is missing or fails; the directory is then
# left in place.
as_zip() {

  local path="$1"
  local zip_file=
  # -r: without it zip stores sub-directory entries but not their content,
  # which would then be lost when the directory is removed below.
  local -a zip_flags=(-r)

  if [[ -d $path ]] ; then

    command -v zip >/dev/null 2>&1 || error "zip command not found. Please install zip package."
    [[ $QUIET -eq 0 ]] || zip_flags+=(-q)

    zip_file="$path.zip"
    # zip progress goes to stderr: stdout carries the resulting path, which
    # callers capture with $(...).
    zip "${zip_flags[@]}" "$zip_file" "$path"/* >&2 || error "Unable to zip folder $path."
    rm -r "$path"
    path="$zip_file"
  fi

  echo "$path"
}
349
350 # As folder {{{1
351 ################################################################
352
# Make sure a download is available as a directory.
# Arguments: $1 - path of a zip file or of a directory
#            $2 - wanted output directory (may be empty)
# Outputs:   the resulting directory path on stdout.
# A zip file is extracted with unzip_study then removed. A directory is
# moved into $2 when $2 is set and different. Anything else is returned
# unchanged.
# Exits with status 1 when the extraction or the move fails; the source is
# then left in place.
as_folder() {

  local path="$1"
  local output="$2"
  local folder=

  debug "as_folder($path, $output)"
  if [[ -f $path && ${path##*.} == 'zip' ]] ; then
    # unzip_study runs in a subshell here, so its "exit" on error does not
    # stop this function: propagate the failure before deleting the archive.
    folder=$(unzip_study "$path" "$output") || exit 1
    rm "$path"
    path="$folder"
  elif [[ -d $path && -n $output && $path != "$output" ]] ; then
    mkdir -p "$output" || error "Unable to create directory $output."
    # Do not remove the source folder when its content could not be moved.
    mv "$path"/* "$output"/. || error "Unable to move content of $path into $output."
    rm -r "$path"
    path="$output"
  fi

  echo "$path"
}
372
373 # Make folder or zip {{{1
374 ################################################################
375
# Convert a download into the requested output format.
# Globals read: YES.
# Arguments:    $1 - path of the downloaded file or directory
#               $2 - $YES to get a zip file, anything else to get a directory
#               $3 - output directory, used for the directory format only
# Outputs:      the resulting path on stdout.
# Exits when $1 does not exist or when the conversion fails.
make_folder_or_zip() {

  local path="$1"
  local compressed="$2"
  local output="$3"

  debug "make_folder_or_zip($path, $compressed, $output)"
  [[ -e $path ]] || error "No file or folder at path \"$path\"."

  # as_zip and as_folder run in a subshell here, so an "exit" inside them
  # does not stop this function: propagate their failure explicitly.
  if [[ $compressed == "$YES" ]] ; then
    # Compress folder
    path=$(as_zip "$path") || exit 1
  else
    # Uncompress archive or move folder
    path=$(as_folder "$path" "$output") || exit 1
  fi

  echo "$path"
}
394
395 # Write HTML file {{{1
396 ################################################################
397
# Print one section of the HTML index: a title followed by one link per
# file of directory $2 whose name starts with prefix $3.
_write_html_links() {

  local title="$1"
  local dir="$2"
  local prefix="$3"
  local f
  local filename

  echo "<br/>"
  echo " $title:<br/>"
  for f in "$dir/$prefix"* ; do
    # An unmatched glob stays literal: skip it instead of linking to "s_*".
    [[ -e $f ]] || continue
    filename=${f##*/}
    echo " <a href=\"$filename\">$filename</a><br/>"
  done
}

# Write an HTML page listing the ISA-Tab files of a study directory.
# Arguments: $1 - path of the HTML file to write (overwritten)
#            $2 - study directory whose s_*, a_* and m_* files are listed
# Links are relative file names.
write_html_file() {

  local HTML=$1
  local tmp_output=$2

  {
    # <head> (not <header>) is the element that may contain <title>.
    printf '%s\n' \
      '<html>' \
      '<head>' \
      '<title>Metabolights study</title>' \
      '</head>' \
      '<body>' \
      '<a href="i_Investigation.txt">Investigation file</a><br/>'

    _write_html_links "Study files" "$tmp_output" "s_"
    _write_html_links "Assay files" "$tmp_output" "a_"
    _write_html_links "Data files" "$tmp_output" "m_"

    printf '%s\n' \
      '</body>' \
      '</html>'
  } >"$HTML"
}
438
439 # Get data files {{{1
440 ################################################################
441
# List the data files referenced by a JSON file.
# Arguments: $1 - path of a JSON file holding a list of objects, each with
#                 a 'data_files' list of strings.
# Outputs:   one data file name per line on stdout (an element with an
#            empty list yields an empty line).
# The path is passed to Python as an argument rather than pasted into the
# Python source, so a quote in the path can neither break the script nor
# inject code. The here-document delimiter is quoted: the shell expands
# nothing inside it.
get_data_files() {

  local json_file="$1"

  python3 - "$json_file" <<'EOF'
# @@@BEGIN_PYTHON@@@
import json
import sys
with open(sys.argv[1]) as f:
    data_list = json.load(f)
for elem in data_list:
    print("\n".join(elem['data_files']))
# @@@END_PYTHON@@@
EOF
}
457
458 # MAIN {{{1
459 ################################################################
460
# Main flow: parse the options, download the study (whole, metadata only,
# or the subset matching a factor value), convert the result to the
# requested format, then optionally write the HTML index and rename the
# output.
read_args "$@"

study_name=$(basename "$STUDY")
downloader=$WGET
output_dir=
[[ $ASPERA == "$YES" ]] && downloader=$ASCP
[[ $TMP_IN_OUTPUT != "$YES" ]] || output_dir="$OUTPUT"

# Download only part of the study using factor value
if [[ -n $FACTOR_VALUE ]] ; then

  # Get factor name and value
  factor_name=${FACTOR_VALUE%%=*}
  factor_value=${FACTOR_VALUE#*=}

  # Download only metadata
  download "$downloader" "$study_name" "$YES" # XXX Download output must be written into $OUTPUT if TMP_IN_OUTPUT is set
  dwnld_output=$(get_download_output_path "$downloader" "$study_name")
  # A failure inside $(...) only exits the subshell: propagate it.
  dwnld_output=$(as_folder "$dwnld_output") || exit 1

  # Get data files to download
  data_files=$(mktemp -t "$PROG_NAME.XXXXXX") || error "Unable to create temporary file." # XXX must be created into $OUTPUT if TMP_IN_OUTPUT is set
  [[ -x "$ISASLICER" ]] || error "Cannot find or run isaslicer.py script."
  # Resolve the path before logging it (it used to be logged while unset).
  abs_dwnld_output=$(realpath "$dwnld_output")
  debug "Run ISA slicer: \"$ISASLICER\" 'isa-tab-get-data-list' \"$abs_dwnld_output\" \"$data_files\" --json-query \"{ \\\"$factor_name\\\": \\\"$factor_value\\\" }\""
  "$ISASLICER" --log-level DEBUG 'isa-tab-get-data-list' "$abs_dwnld_output" "$data_files" --json-query "{ \"$factor_name\": \"$factor_value\" }" 2>&1 || error "Call to isaslicer failed."

  # Download data files
  wget_log_file=$(mktemp -t "$PROG_NAME.XXXXXX") || error "Unable to create temporary log file."
  dwnld_flags=()
  [[ $QUIET -eq 0 ]] || dwnld_flags=(-q -o "$wget_log_file")
  file_list=$(get_data_files "$data_files") || error "Unable to read the list of data files."
  # The loop reads from a process substitution rather than a pipe so it runs
  # in the main shell: error now stops the script, not just a pipeline
  # subshell.
  # NOTE(review): the token is not appended to these links, unlike in
  # download_with_wget — confirm whether private studies need it here.
  while IFS= read -r file ; do
    [[ -n $file ]] || continue
    dwnld_link="https://www.ebi.ac.uk/metabolights/$study_name/files/$file"
    debug "Download command: $WGET $WGET_FLAGS ${dwnld_flags[*]} $dwnld_link"
    # The data file may live in a sub-directory that does not exist yet.
    mkdir -p "$(dirname "$dwnld_output/$file")"
    # WGET_FLAGS is deliberately unquoted: it may be empty or hold several flags.
    # shellcheck disable=SC2086
    if ! $WGET $WGET_FLAGS "${dwnld_flags[@]}" -O "$dwnld_output/$file" "$dwnld_link" ; then # XXX Download output must be written into $OUTPUT if TMP_IN_OUTPUT is set
      wget_log=$(cat "$wget_log_file")
      rm -f "$wget_log_file" "$data_files"
      error "Download of study file \"$file\" has failed. wget log file: $wget_log"
    fi
  done < <(sort -u <<<"$file_list")
  rm -f "$wget_log_file"

  # Remove data files list
  rm "$data_files"

# Download whole study
else

  # Download whole study
  download "$downloader" "$study_name" "$METADATA_ONLY" "$output_dir" # XXX Download output must be written into $OUTPUT if TMP_IN_OUTPUT is set

  # Get output path
  dwnld_output=$(get_download_output_path "$downloader" "$study_name" "$output_dir") # XXX Correct output path when TMP_IN_OUTPUT is set
fi

# Check output
check_download "$dwnld_output"

# Output in right format (zipped or folder)
dwnld_output=$(make_folder_or_zip "$dwnld_output" "$COMPRESSED" "$output_dir") || exit 1 # XXX zip must be written into $OUTPUT and unzipped into $OUTPUT if TMP_IN_OUTPUT is set

# Output HTML
[[ -z $HTML || ! -d $dwnld_output ]] || write_html_file "$HTML" "$dwnld_output"

# Rename output
[[ -z $OUTPUT || $TMP_IN_OUTPUT == "$YES" ]] || mv "$dwnld_output" "$OUTPUT"